diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index c19bb68986f..8190b5d0297 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -33,3 +33,4 @@ ENV SCCACHE_REGION="us-east-2" ENV SCCACHE_BUCKET="rapids-sccache-devs" ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai" ENV HISTFILE="/home/coder/.cache/._bash_history" +ENV LIBCUDF_KERNEL_CACHE_PATH="/home/coder/cudf/cpp/build/${PYTHON_PACKAGE_MANAGER}/cuda-${CUDA_VERSION}/latest/jitify_cache" diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 6e71505fc7e..944a73ecc98 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,12 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.04-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-conda" + ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 15b51da8dea..8b802333bda 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,12 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.04-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-ubuntu22.04" } }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-pip" + ], "hostRequirements": {"gpu": 
"optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json index 31ae8426763..886b07025cc 100644 --- a/.devcontainer/cuda12.2-conda/devcontainer.json +++ b/.devcontainer/cuda12.2-conda/devcontainer.json @@ -5,12 +5,17 @@ "args": { "CUDA": "12.2", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.04-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.06-cpp-mambaforge-ubuntu22.04" } }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-conda" + ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json index 93367527a86..86df56ada19 100644 --- a/.devcontainer/cuda12.2-pip/devcontainer.json +++ b/.devcontainer/cuda12.2-pip/devcontainer.json @@ -5,12 +5,17 @@ "args": { "CUDA": "12.2", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.04-cpp-cuda12.2-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.2-ubuntu22.04" } }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-pip" + ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} }, "overrideFeatureInstallOrder": [ 
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 31cfeaf4ca3..9efac3f1904 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -16,8 +16,14 @@ cpp/libcudf_kafka/CMakeLists.txt @rapidsai/cudf-cmake-codeowners #java code owners java/ @rapidsai/cudf-java-codeowners -#build/ops code owners -.github/ @rapidsai/ops-codeowners -/ci/ @rapidsai/ops-codeowners -conda/ @rapidsai/ops-codeowners -dependencies.yaml @rapidsai/ops-codeowners +#CI code owners +/.github/ @rapidsai/ci-codeowners +/ci/ @rapidsai/ci-codeowners +/.pre-commit-config.yaml @rapidsai/ci-codeowners + +#packaging code owners +/.devcontainers/ @rapidsai/packaging-codeowners +/conda/ @rapidsai/packaging-codeowners +/dependencies.yaml @rapidsai/packaging-codeowners +/build.sh @rapidsai/packaging-codeowners +pyproject.toml @rapidsai/packaging-codeowners diff --git a/.github/ISSUE_TEMPLATE/pandas_function_request.md b/.github/ISSUE_TEMPLATE/pandas_function_request.md index 1cecca72953..19f1377dfe7 100644 --- a/.github/ISSUE_TEMPLATE/pandas_function_request.md +++ b/.github/ISSUE_TEMPLATE/pandas_function_request.md @@ -2,7 +2,7 @@ name: Request a Missing Pandas Function about: Request GPU support for a function executed on the CPU in pandas accelerator mode. title: "[FEA]" -labels: "? 
- Needs Triage, feature request" +labels: "Needs Triage, feature request, cudf.pandas" assignees: '' --- diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index ef2141ed934..6942ef0009d 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,10 +69,9 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} - build-2_28-wheels: "true" branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} date: ${{ 
inputs.date }} @@ -80,7 +79,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -90,7 +89,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -102,10 +101,28 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} date: ${{ inputs.date }} package-name: dask_cudf + trigger-pandas-tests: + if: inputs.build_type == 'nightly' + needs: wheel-build-cudf + runs-on: ubuntu-latest + steps: + - name: Checkout code repo + uses: actions/checkout@v4 + with: + ref: ${{ inputs.sha }} + persist-credentials: false + - name: Trigger pandas-tests + env: + GH_TOKEN: ${{ github.token }} + run: | + gh workflow run pandas-tests.yaml \ + -f branch=${{ inputs.branch }} \ + -f sha=${{ inputs.sha }} \ + -f date=${{ inputs.date }} diff --git a/.github/workflows/jni-docker-build.yml b/.github/workflows/jni-docker-build.yml deleted file mode 100644 index 0bdc409d0ab..00000000000 --- a/.github/workflows/jni-docker-build.yml +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2022, NVIDIA 
CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -name: JNI Docker Build - -on: - workflow_dispatch: # manual trigger only - -concurrency: - group: jni-docker-build-${{ github.ref }} - cancel-in-progress: true - -jobs: - docker-build: - if: github.repository == 'rapidsai/cudf' - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - - name: Set up QEMU - uses: docker/setup-qemu-action@v2 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Login to DockerHub - uses: docker/login-action@v2 - with: - username: ${{ secrets.GPUCIBOT_DOCKERHUB_USER }} - password: ${{ secrets.GPUCIBOT_DOCKERHUB_TOKEN }} - - - name: Set ENVs - run: | - echo "IMAGE_NAME=rapidsai/cudf-jni-build" >> $GITHUB_ENV - echo "IMAGE_REF=${GITHUB_REF_NAME}" >> $GITHUB_ENV - - - name: Build and Push - uses: docker/build-push-action@v3 - with: - push: true - file: java/ci/Dockerfile.centos7 - tags: "${{ env.IMAGE_NAME }}:${{ env.IMAGE_REF }}" diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml new file mode 100644 index 00000000000..60544294809 --- /dev/null +++ b/.github/workflows/pandas-tests.yaml @@ -0,0 +1,27 @@ +name: Pandas Test Job + +on: + workflow_dispatch: + inputs: + branch: + required: true + type: string + date: + required: true + type: string + sha: + required: true + type: string + +jobs: + pandas-tests: + # run the Pandas unit tests + secrets: inherit + uses: 
rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 + with: + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" )) + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + script: ci/cudf_pandas_scripts/pandas-tests/run.sh main diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 41bf22cf47f..f9d5976f1fe 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -20,6 +20,7 @@ jobs: - conda-python-cudf-tests - conda-python-other-tests - conda-java-tests + - static-configure - conda-notebook-tests - docs-build - wheel-build-cudf @@ -29,44 +30,43 @@ jobs: - devcontainer - unit-tests-cudf-pandas - pandas-tests - #- pandas-tests-diff - #- pandas-tests-diff-comment + - pandas-tests-diff secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.06 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.06 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.06 with: build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 + uses: 
rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 with: build_type: pull-request script: "ci/test_python_cudf.sh" @@ -74,24 +74,34 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 with: build_type: pull-request script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" + static-configure: + needs: checks + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + with: + build_type: pull-request + # Use the wheel container so we can skip conda solves and since our + # primary static consumers (Spark) are not in conda anyway. 
+ container_image: "rapidsai/ci-wheel:latest" + run_script: "ci/configure_cpp_static.sh" conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -101,7 +111,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -111,22 +121,21 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 with: build_type: pull-request - build-2_28-wheels: "true" script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -135,7 +144,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -143,7 +152,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.06 with: arch: '["amd64"]' cuda: '["12.2"]' @@ -154,7 +163,7 @@ jobs: unit-tests-cudf-pandas: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request @@ -163,42 +172,18 @@ jobs: # run the Pandas unit tests using PR branch needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: - matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + matrix_filter: 
map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" )) build_type: pull-request script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr # Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit. test_summary_show: "none" - #pandas-tests-diff: - # # diff the results of running the Pandas unit tests and publish a job summary - # needs: [pandas-tests-main, pandas-tests-pr] - # secrets: inherit - # # This branch exports a `job_output` output that the downstream job reads. - # uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 - # with: - # node_type: cpu4 - # build_type: pull-request - # run_script: ci/cudf_pandas_scripts/pandas-tests/diff.sh - #pandas-tests-diff-comment: - # # Post comment of pass/fail rate on PR - # runs-on: ubuntu-latest - # needs: pandas-tests-diff - # steps: - # - uses: actions/github-script@v6 - # with: - # script: | - # const branch = process.env.GITHUB_REF_NAME; - # const prBranchPattern = new RegExp("^pull-request/[0-9]+$"); - # if (!branch.match(prBranchPattern)) { - # throw new Error(`${branch} does not match PR branch pattern.`); - # } - # const summary_url = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; - # const prNumber = branch.split("/")[1]; - # const summary_comment = `${{ needs.pandas-tests-diff.outputs.job_output }}`; - # github.rest.issues.createComment({ - # issue_number: prNumber, - # owner: context.repo.owner, - # repo: context.repo.repo, - # body: `${summary_comment}\n\nHere is [a link to the full test summary](${summary_url}).\n` - # }) + pandas-tests-diff: + # diff the results of running the Pandas unit tests and publish a job summary + needs: pandas-tests + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + with: + node_type: cpu4 + build_type: pull-request + run_script: "ci/cudf_pandas_scripts/pandas-tests/diff.sh" diff --git a/.github/workflows/status.yaml 
b/.github/workflows/status.yaml new file mode 100644 index 00000000000..781264bc55e --- /dev/null +++ b/.github/workflows/status.yaml @@ -0,0 +1,120 @@ +name: Custom GH Status from Workflow Artifacts + +on: + workflow_run: + workflows: ["pr"] + types: + - completed + +jobs: + process_artifacts: + if: ${{ github.event.workflow_run.conclusion == 'success' }} + runs-on: ubuntu-latest + outputs: + artifact_downloaded: ${{ steps.download_artifact.outputs.artifact_downloaded }} + permissions: + actions: read + checks: read + contents: read + deployments: read + id-token: write + issues: read + discussions: read + packages: read + pages: read + pull-requests: read + repository-projects: read + security-events: read + statuses: write + steps: + - name: Download artifact + id: download_artifact + uses: actions/github-script@v7 + with: + retries: 3 + script: | + const fs = require('fs'); + const path = require('path'); + const artifactName = 'gh-status'; + + const allArtifacts = await github.rest.actions.listWorkflowRunArtifacts({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: context.payload.workflow_run.id, + }); + // Find the specific artifact + const artifact = allArtifacts.data.artifacts.find(artifact => artifact.name === artifactName); + if (!artifact) { + core.info(`Artifact "${artifactName}" not found. 
Exiting safely.`); + core.setOutput('artifact_downloaded', 'false'); + return; + } + core.setOutput('artifact_downloaded', 'true'); + // Download the artifact + const download = await github.rest.actions.downloadArtifact({ + owner: context.repo.owner, + repo: context.repo.repo, + artifact_id: artifact.id, + archive_format: 'zip', + }); + + // Write the artifact to a file + fs.writeFileSync(`${process.env.GITHUB_WORKSPACE}/${artifactName}.zip`, Buffer.from(download.data)); + + - name: 'Unzip artifact' + if: ${{ steps.download_artifact.outputs.artifact_downloaded == 'true' }} + run: unzip 'gh-status.zip' + + - name: Create status + if: ${{ steps.download_artifact.outputs.artifact_downloaded == 'true' }} + uses: actions/github-script@v7 + env: + WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }} + COMMIT_SHA: ${{ github.event.workflow_run.head_sha }} + ATTEMPTS: ${{ github.event.workflow_run.run_attempt }} + with: + retries: 3 + script: | + // Load the JSON content + const contentJSON = require('./gh-status.json'); + const { + job_name: JOB_NAME, + context: CUSTOM_CONTEXT = 'Custom CI Status Check', + description: CUSTOM_DESCRIPTION = 'Custom CI Status description', + target_url: CUSTOM_TARGET_URL, + state: CUSTOM_STATE = 'success' + } = contentJSON; + + // Fetch all jobs using pagination + const jobs = await github.paginate( + github.rest.actions.listJobsForWorkflowRun, + { + owner: context.repo.owner, + repo: context.repo.repo, + run_id: process.env.WORKFLOW_RUN_ID, + } + ); + + // Fetch the first job ID from the workflow run + const job = jobs.find(job => job.name === JOB_NAME); + const JOB_ID = job ? 
job.id : null; + + // Set default target URL if not defined + const targetUrl = CUSTOM_TARGET_URL || `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${process.env.WORKFLOW_RUN_ID}/attempts/${process.env.ATTEMPTS}#summary-${JOB_ID}`; + + console.log("job id: ", JOB_ID); + console.log("state: ", CUSTOM_STATE); + console.log("target url: ", targetUrl); + console.log("description: ", CUSTOM_DESCRIPTION); + console.log("context: ", CUSTOM_CONTEXT); + + // Create status + await github.rest.repos.createCommitStatus({ + owner: context.repo.owner, + repo: context.repo.repo, + sha: process.env.COMMIT_SHA, + state: CUSTOM_STATE, + target_url: targetUrl, + description: CUSTOM_DESCRIPTION, + context: CUSTOM_CONTEXT, + }); diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index aeb092111a7..170f45e23fd 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,7 +33,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -43,9 +43,18 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_cpp_memcheck.sh" + 
static-configure: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + with: + build_type: pull-request + # Use the wheel container so we can skip conda solves and since our + # primary static consumers (Spark) are not in conda anyway. + container_image: "rapidsai/ci-wheel:latest" + run_script: "ci/configure_cpp_static.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -55,7 +64,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -64,7 +73,7 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -76,7 +85,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -88,7 +97,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -97,7 +106,7 @@ jobs: script: 
ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -108,21 +117,10 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} script: ci/cudf_pandas_scripts/run_tests.sh - pandas-tests: - # run the Pandas unit tests - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 - with: - matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) - build_type: nightly - branch: ${{ inputs.branch }} - date: ${{ inputs.date }} - sha: ${{ inputs.sha }} - script: ci/cudf_pandas_scripts/pandas-tests/run.sh main diff --git a/.gitignore b/.gitignore index 471d4100458..c89fb49697a 100644 --- a/.gitignore +++ b/.gitignore @@ -78,6 +78,7 @@ CMakeFiles/ Debug build/ cpp/build/ +cpp/examples/*/install/ cpp/include/cudf/ipc_generated/*.h cpp/thirdparty/googletest/ @@ -160,9 +161,6 @@ ENV/ # Dask dask-worker-space/ -# protobuf -**/*_pb2.py - # Sphinx docs & build artifacts docs/cudf/source/api_docs/generated/* docs/cudf/source/user_guide/api_docs/api/* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ce5d4f93444..2d3ffc287e9 100644 --- a/.pre-commit-config.yaml +++ 
b/.pre-commit-config.yaml @@ -2,16 +2,18 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 + rev: v4.6.0 hooks: - id: trailing-whitespace exclude: | (?x)^( + ^cpp/cmake/thirdparty/patches/.*| ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* ) - id: end-of-file-fixer exclude: | (?x)^( + ^cpp/cmake/thirdparty/patches/.*| ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* ) - repo: https://github.com/PyCQA/isort @@ -22,13 +24,15 @@ repos: # project can specify its own first/third-party packages. args: ["--config-root=python/", "--resolve-all-configs"] files: python/.* + exclude: | + (?x)^(^python/cudf_polars/.*) types_or: [python, cython, pyi] - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.16.0 + rev: v0.16.2 hooks: - id: cython-lint - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.3.0' + rev: 'v1.10.0' hooks: - id: mypy additional_dependencies: [types-cachetools] @@ -36,22 +40,11 @@ repos: "python/cudf/cudf", "python/custreamz/custreamz", "python/cudf_kafka/cudf_kafka", + "python/cudf_polars/cudf_polars", "python/dask_cudf/dask_cudf"] pass_filenames: false - - repo: https://github.com/PyCQA/pydocstyle - rev: 6.3.0 - hooks: - - id: pydocstyle - # https://github.com/PyCQA/pydocstyle/issues/603 - additional_dependencies: [tomli] - args: ["--config=pyproject.toml"] - exclude: | - (?x)^( - ^python/cudf/cudf/pandas/scripts/.*| - ^python/cudf/cudf_pandas_tests/.* - ) - repo: https://github.com/nbQA-dev/nbQA - rev: 1.7.1 + rev: 1.8.5 hooks: - id: nbqa-isort # Use the cudf_kafka isort orderings in notebooks so that dask @@ -64,7 +57,7 @@ repos: types_or: [c, c++, cuda] args: ["-fallback-style=none", "-style=file", "-i"] - repo: https://github.com/sirosen/texthooks - rev: 0.6.3 + rev: 0.6.6 hooks: - id: fix-smartquotes exclude: | @@ -125,7 +118,7 @@ repos: pass_filenames: false verbose: true - repo: 
https://github.com/codespell-project/codespell - rev: v2.2.2 + rev: v2.2.6 hooks: - id: codespell additional_dependencies: [tomli] @@ -136,12 +129,12 @@ repos: ^CHANGELOG.md$ ) - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.8.0 + rev: v1.13.4 hooks: - id: rapids-dependency-file-generator args: ["--clean"] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.13 + rev: v0.4.3 hooks: - id: ruff files: python/.*$ @@ -152,9 +145,11 @@ repos: hooks: - id: verify-copyright exclude: | - (?x) - cpp/include/cudf_test/cxxopts[.]hpp$ - + (?x)^( + cpp/include/cudf_test/cxxopts[.]hpp$| + cpp/src/io/parquet/ipc/Message_generated[.]h$| + cpp/src/io/parquet/ipc/Schema_generated[.]h$ + ) default_language_version: python: python3 diff --git a/CHANGELOG.md b/CHANGELOG.md index 5eb4ac9845b..871ef8ba1df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,309 @@ +# cudf 24.06.00 (5 Jun 2024) + +## 🚨 Breaking Changes + +- Deprecate `Groupby.collect` ([#15808](https://github.com/rapidsai/cudf/pull/15808)) [@galipremsagar](https://github.com/galipremsagar) +- Raise FileNotFoundError when a literal JSON string that looks like a json filename is passed ([#15806](https://github.com/rapidsai/cudf/pull/15806)) [@lithomas1](https://github.com/lithomas1) +- Support filtered I/O in `chunked_parquet_reader` and simplify the use of `parquet_reader_options` ([#15764](https://github.com/rapidsai/cudf/pull/15764)) [@mhaseeb123](https://github.com/mhaseeb123) +- Raise errors for unsupported operations on certain types ([#15712](https://github.com/rapidsai/cudf/pull/15712)) [@galipremsagar](https://github.com/galipremsagar) +- Support `DurationType` in cudf parquet reader via `arrow:schema` ([#15617](https://github.com/rapidsai/cudf/pull/15617)) [@mhaseeb123](https://github.com/mhaseeb123) +- Remove protobuf and use parsed ORC statistics from libcudf 
([#15564](https://github.com/rapidsai/cudf/pull/15564)) [@bdice](https://github.com/bdice) +- Remove legacy JSON reader from Python ([#15538](https://github.com/rapidsai/cudf/pull/15538)) [@bdice](https://github.com/bdice) +- Removing all batching code from parquet writer ([#15528](https://github.com/rapidsai/cudf/pull/15528)) [@mhaseeb123](https://github.com/mhaseeb123) +- Convert libcudf resource parameters to rmm::device_async_resource_ref ([#15507](https://github.com/rapidsai/cudf/pull/15507)) [@harrism](https://github.com/harrism) +- Remove deprecated strings offsets_begin ([#15454](https://github.com/rapidsai/cudf/pull/15454)) [@davidwendt](https://github.com/davidwendt) +- Floating <--> fixed-point conversion must now be called explicitly ([#15438](https://github.com/rapidsai/cudf/pull/15438)) [@pmattione-nvidia](https://github.com/pmattione-nvidia) +- Bind `read_parquet_metadata` API to libcudf instead of pyarrow and extract `RowGroup` information ([#15398](https://github.com/rapidsai/cudf/pull/15398)) [@mhaseeb123](https://github.com/mhaseeb123) +- Remove deprecated hash() and spark_murmurhash3_x86_32() ([#15375](https://github.com/rapidsai/cudf/pull/15375)) [@davidwendt](https://github.com/davidwendt) +- Remove empty elements from exploded character-ngrams output ([#15371](https://github.com/rapidsai/cudf/pull/15371)) [@davidwendt](https://github.com/davidwendt) +- [FEA] Performance improvement for mixed left semi/anti join ([#15288](https://github.com/rapidsai/cudf/pull/15288)) [@tgujar](https://github.com/tgujar) +- Align date_range defaults with pandas, support tz ([#15139](https://github.com/rapidsai/cudf/pull/15139)) [@mroeschke](https://github.com/mroeschke) + +## 🐛 Bug Fixes + +- Revert "Fix docs for IO readers and strings_convert" 
([#15872](https://github.com/rapidsai/cudf/pull/15872)) [@vyasr](https://github.com/vyasr) +- Remove problematic call of index setter to unblock dask-cuda CI ([#15844](https://github.com/rapidsai/cudf/pull/15844)) [@charlesbluca](https://github.com/charlesbluca) +- Use rapids_cpm_nvtx3 to get same nvtx3 target state as rmm ([#15840](https://github.com/rapidsai/cudf/pull/15840)) [@robertmaynard](https://github.com/robertmaynard) +- Return boolean from config_host_memory_resource instead of throwing ([#15815](https://github.com/rapidsai/cudf/pull/15815)) [@abellina](https://github.com/abellina) +- Add temporary dask-cudf workaround for categorical sorting ([#15801](https://github.com/rapidsai/cudf/pull/15801)) [@rjzamora](https://github.com/rjzamora) +- Fix row group alignment in ORC writer ([#15789](https://github.com/rapidsai/cudf/pull/15789)) [@vuule](https://github.com/vuule) +- Raise error when sorting by categorical column in dask-cudf ([#15788](https://github.com/rapidsai/cudf/pull/15788)) [@rjzamora](https://github.com/rjzamora) +- Upgrade `arrow` to 16.1 ([#15787](https://github.com/rapidsai/cudf/pull/15787)) [@galipremsagar](https://github.com/galipremsagar) +- Add support for `PandasArray` for `pandas<2.1.0` ([#15786](https://github.com/rapidsai/cudf/pull/15786)) [@galipremsagar](https://github.com/galipremsagar) +- Limit runtime dependency to `libarrow>=16.0.0,<16.1.0a0` ([#15782](https://github.com/rapidsai/cudf/pull/15782)) [@pentschev](https://github.com/pentschev) +- Fix cat.as_ordered not propagating correct size ([#15780](https://github.com/rapidsai/cudf/pull/15780)) [@mroeschke](https://github.com/mroeschke) +- Handle mixed-like homogeneous types in `isin` ([#15771](https://github.com/rapidsai/cudf/pull/15771))
[@galipremsagar](https://github.com/galipremsagar) +- Fix id_vars and value_vars not accepting string scalars in melt ([#15765](https://github.com/rapidsai/cudf/pull/15765)) [@mroeschke](https://github.com/mroeschke) +- Fix `DatetimeIndex.loc` for all types of ordering cases ([#15761](https://github.com/rapidsai/cudf/pull/15761)) [@galipremsagar](https://github.com/galipremsagar) +- Fix arrow versioning logic ([#15755](https://github.com/rapidsai/cudf/pull/15755)) [@vyasr](https://github.com/vyasr) +- Avoid running sanitizer on Java test designed to cause an error ([#15753](https://github.com/rapidsai/cudf/pull/15753)) [@jlowe](https://github.com/jlowe) +- Handle empty dataframe object with index present in setitem of `loc` ([#15752](https://github.com/rapidsai/cudf/pull/15752)) [@galipremsagar](https://github.com/galipremsagar) +- Eliminate circular reference in DataFrame/Series.iloc/loc ([#15749](https://github.com/rapidsai/cudf/pull/15749)) [@mroeschke](https://github.com/mroeschke) +- Cap the absolute row index per pass in parquet chunked reader. 
([#15735](https://github.com/rapidsai/cudf/pull/15735)) [@nvdbaranec](https://github.com/nvdbaranec) +- Fix `Index.repeat` for `datetime64` types ([#15722](https://github.com/rapidsai/cudf/pull/15722)) [@galipremsagar](https://github.com/galipremsagar) +- Fix multibyte check for case convert for large strings ([#15721](https://github.com/rapidsai/cudf/pull/15721)) [@davidwendt](https://github.com/davidwendt) +- Fix `get_loc` to properly fetch results from an index that is in decreasing order ([#15719](https://github.com/rapidsai/cudf/pull/15719)) [@galipremsagar](https://github.com/galipremsagar) +- Return same type as the original index for `.loc` operations ([#15717](https://github.com/rapidsai/cudf/pull/15717)) [@galipremsagar](https://github.com/galipremsagar) +- Correct static builds + static arrow ([#15715](https://github.com/rapidsai/cudf/pull/15715)) [@robertmaynard](https://github.com/robertmaynard) +- Raise errors for unsupported operations on certain types ([#15712](https://github.com/rapidsai/cudf/pull/15712)) [@galipremsagar](https://github.com/galipremsagar) +- Fix ColumnAccessor caching of nrows if empty previously ([#15710](https://github.com/rapidsai/cudf/pull/15710)) [@mroeschke](https://github.com/mroeschke) +- Allow `None` when `nan_as_null=False` in column constructor ([#15709](https://github.com/rapidsai/cudf/pull/15709)) [@galipremsagar](https://github.com/galipremsagar) +- Refine `CudaTest.testCudaException` in case throwing wrong type of CudaError under aarch64 ([#15706](https://github.com/rapidsai/cudf/pull/15706)) [@sperlingxx](https://github.com/sperlingxx) +- Fix maxima of categorical column ([#15701](https://github.com/rapidsai/cudf/pull/15701)) [@rjzamora](https://github.com/rjzamora) +- Add proxy for inplace operations in `cudf.pandas` 
([#15695](https://github.com/rapidsai/cudf/pull/15695)) [@galipremsagar](https://github.com/galipremsagar) +- Make `nan_as_null` behavior consistent across all APIs ([#15692](https://github.com/rapidsai/cudf/pull/15692)) [@galipremsagar](https://github.com/galipremsagar) +- Fix CI s3 api command to fetch latest results ([#15687](https://github.com/rapidsai/cudf/pull/15687)) [@galipremsagar](https://github.com/galipremsagar) +- Add `NumpyExtensionArray` proxy type in `cudf.pandas` ([#15686](https://github.com/rapidsai/cudf/pull/15686)) [@galipremsagar](https://github.com/galipremsagar) +- Properly implement binaryops for proxy types ([#15684](https://github.com/rapidsai/cudf/pull/15684)) [@galipremsagar](https://github.com/galipremsagar) +- Fix copy assignment and the comparison operator of `rmm_host_allocator` ([#15677](https://github.com/rapidsai/cudf/pull/15677)) [@vuule](https://github.com/vuule) +- Fix multi-source reading in JSON byte range reader ([#15671](https://github.com/rapidsai/cudf/pull/15671)) [@shrshi](https://github.com/shrshi) +- Return `int64` when pandas compatible mode is turned on for `get_indexer` ([#15659](https://github.com/rapidsai/cudf/pull/15659)) [@galipremsagar](https://github.com/galipremsagar) +- Fix Index contains for error validations and float vs int comparisons ([#15657](https://github.com/rapidsai/cudf/pull/15657)) [@galipremsagar](https://github.com/galipremsagar) +- Preserve sub-second data for time scalars in column construction ([#15655](https://github.com/rapidsai/cudf/pull/15655)) [@galipremsagar](https://github.com/galipremsagar) +- Check row limit size in cudf::strings::join_strings ([#15643](https://github.com/rapidsai/cudf/pull/15643)) [@davidwendt](https://github.com/davidwendt) +- Enable sorting on column with nulls using query-planning 
([#15639](https://github.com/rapidsai/cudf/pull/15639)) [@rjzamora](https://github.com/rjzamora) +- Fix operator precedence problem in Parquet reader ([#15638](https://github.com/rapidsai/cudf/pull/15638)) [@etseidl](https://github.com/etseidl) +- Fix decoding of dictionary encoded FIXED_LEN_BYTE_ARRAY data in Parquet reader ([#15601](https://github.com/rapidsai/cudf/pull/15601)) [@etseidl](https://github.com/etseidl) +- Fix debug warnings/errors in from_arrow_device_test.cpp ([#15596](https://github.com/rapidsai/cudf/pull/15596)) [@davidwendt](https://github.com/davidwendt) +- Add "collect" aggregation support to dask-cudf ([#15593](https://github.com/rapidsai/cudf/pull/15593)) [@rjzamora](https://github.com/rjzamora) +- Fix categorical-accessor support and testing in dask-cudf ([#15591](https://github.com/rapidsai/cudf/pull/15591)) [@rjzamora](https://github.com/rjzamora) +- Disable compute-sanitizer usage in CI tests with CUDA<11.6 ([#15584](https://github.com/rapidsai/cudf/pull/15584)) [@davidwendt](https://github.com/davidwendt) +- Preserve RangeIndex.step in to_arrow/from_arrow ([#15581](https://github.com/rapidsai/cudf/pull/15581)) [@mroeschke](https://github.com/mroeschke) +- Ignore new cupy warning ([#15574](https://github.com/rapidsai/cudf/pull/15574)) [@vyasr](https://github.com/vyasr) +- Add cuda-sanitizer-api dependency for test-cpp matrix 11.4 ([#15573](https://github.com/rapidsai/cudf/pull/15573)) [@davidwendt](https://github.com/davidwendt) +- Allow apply udf to reference global modules in cudf.pandas ([#15569](https://github.com/rapidsai/cudf/pull/15569)) [@mroeschke](https://github.com/mroeschke) +- Fix deprecation warnings for json legacy reader ([#15563](https://github.com/rapidsai/cudf/pull/15563)) [@davidwendt](https://github.com/davidwendt) +- Fix 
millisecond resampling in cudf Python ([#15560](https://github.com/rapidsai/cudf/pull/15560)) [@mroeschke](https://github.com/mroeschke) +- Rename JSON_READER_OPTION to JSON_READER_OPTION_NVBENCH. ([#15553](https://github.com/rapidsai/cudf/pull/15553)) [@bdice](https://github.com/bdice) +- Fix a JNI bug in JSON parsing fixup ([#15550](https://github.com/rapidsai/cudf/pull/15550)) [@revans2](https://github.com/revans2) +- Remove conda channel setup from wheel CI image script. ([#15539](https://github.com/rapidsai/cudf/pull/15539)) [@bdice](https://github.com/bdice) +- cudf.pandas: Series dt accessor is CombinedDatetimelikeProperties ([#15523](https://github.com/rapidsai/cudf/pull/15523)) [@wence-](https://github.com/wence-) +- Fix for some compiler warnings in parquet/page_decode.cuh ([#15518](https://github.com/rapidsai/cudf/pull/15518)) [@etseidl](https://github.com/etseidl) +- Fix exponent overflow in strings-to-double conversion ([#15517](https://github.com/rapidsai/cudf/pull/15517)) [@davidwendt](https://github.com/davidwendt) +- nanoarrow uses package override for proper pinned versions generation ([#15515](https://github.com/rapidsai/cudf/pull/15515)) [@robertmaynard](https://github.com/robertmaynard) +- Remove index name overrides in dask-cudf pyarrow table dispatch ([#15514](https://github.com/rapidsai/cudf/pull/15514)) [@charlesbluca](https://github.com/charlesbluca) +- Fix async synchronization issues in json_column.cu ([#15497](https://github.com/rapidsai/cudf/pull/15497)) [@karthikeyann](https://github.com/karthikeyann) +- Add new patch to hide more CCCL APIs ([#15493](https://github.com/rapidsai/cudf/pull/15493)) [@vyasr](https://github.com/vyasr) +- Make improvements in pandas-test reporting ([#15485](https://github.com/rapidsai/cudf/pull/15485)) 
[@galipremsagar](https://github.com/galipremsagar) +- Fixed page data truncation in parquet writer under certain conditions. ([#15474](https://github.com/rapidsai/cudf/pull/15474)) [@nvdbaranec](https://github.com/nvdbaranec) +- Only use data_type constructor with scale for decimal types ([#15472](https://github.com/rapidsai/cudf/pull/15472)) [@wence-](https://github.com/wence-) +- Avoid "p2p" shuffle as a default when `dask_cudf` is imported ([#15469](https://github.com/rapidsai/cudf/pull/15469)) [@rjzamora](https://github.com/rjzamora) +- Fix debug build errors from to_arrow_device_test.cpp ([#15463](https://github.com/rapidsai/cudf/pull/15463)) [@davidwendt](https://github.com/davidwendt) +- Fix base_normalator::integer_sizeof_fn integer dispatch ([#15457](https://github.com/rapidsai/cudf/pull/15457)) [@davidwendt](https://github.com/davidwendt) +- Allow consumers of static builds to find nanoarrow ([#15456](https://github.com/rapidsai/cudf/pull/15456)) [@robertmaynard](https://github.com/robertmaynard) +- Allow jit compilation when using a splayed CUDA toolkit ([#15451](https://github.com/rapidsai/cudf/pull/15451)) [@robertmaynard](https://github.com/robertmaynard) +- Handle case of scan aggregation in groupby-transform ([#15450](https://github.com/rapidsai/cudf/pull/15450)) [@wence-](https://github.com/wence-) +- Test static builds in CI and fix nanoarrow configure ([#15437](https://github.com/rapidsai/cudf/pull/15437)) [@vyasr](https://github.com/vyasr) +- Fixes potential race in JSON parser when parsing JSON lines format and when recovering from invalid lines ([#15419](https://github.com/rapidsai/cudf/pull/15419)) [@elstehle](https://github.com/elstehle) +- Fix errors in chunked ORC writer when no tables were (successfully) written 
([#15393](https://github.com/rapidsai/cudf/pull/15393)) [@vuule](https://github.com/vuule) +- Support implicit array conversion with query-planning enabled ([#15378](https://github.com/rapidsai/cudf/pull/15378)) [@rjzamora](https://github.com/rjzamora) +- Fix arrow-based round trip of empty dataframes ([#15373](https://github.com/rapidsai/cudf/pull/15373)) [@wence-](https://github.com/wence-) +- Remove empty elements from exploded character-ngrams output ([#15371](https://github.com/rapidsai/cudf/pull/15371)) [@davidwendt](https://github.com/davidwendt) +- Remove boundscheck=False setting in cython files ([#15362](https://github.com/rapidsai/cudf/pull/15362)) [@wence-](https://github.com/wence-) +- Patch dask-expr `var` logic in dask-cudf ([#15347](https://github.com/rapidsai/cudf/pull/15347)) [@rjzamora](https://github.com/rjzamora) +- Fix for logical and syntactical errors in libcudf c++ examples ([#15346](https://github.com/rapidsai/cudf/pull/15346)) [@mhaseeb123](https://github.com/mhaseeb123) +- Disable dask-expr in docs builds. ([#15343](https://github.com/rapidsai/cudf/pull/15343)) [@bdice](https://github.com/bdice) +- Apply the cuFile error work around to data_sink as well ([#15335](https://github.com/rapidsai/cudf/pull/15335)) [@vuule](https://github.com/vuule) +- Fix parquet predicate filtering with column projection ([#15113](https://github.com/rapidsai/cudf/pull/15113)) [@karthikeyann](https://github.com/karthikeyann) +- Check column type equality, handling nested types correctly. 
([#14531](https://github.com/rapidsai/cudf/pull/14531)) [@bdice](https://github.com/bdice) + +## 📖 Documentation + +- Fix docs for IO readers and strings_convert ([#15842](https://github.com/rapidsai/cudf/pull/15842)) [@bdice](https://github.com/bdice) +- Update cudf.pandas docs for GA ([#15744](https://github.com/rapidsai/cudf/pull/15744)) [@beckernick](https://github.com/beckernick) +- Add contributing warning about circular imports ([#15691](https://github.com/rapidsai/cudf/pull/15691)) [@er-eis](https://github.com/er-eis) +- Update libcudf developer guide for strings offsets column ([#15661](https://github.com/rapidsai/cudf/pull/15661)) [@davidwendt](https://github.com/davidwendt) +- Update developer guide with device_async_resource_ref guidelines ([#15562](https://github.com/rapidsai/cudf/pull/15562)) [@harrism](https://github.com/harrism) +- DOC: add pandas intersphinx mapping ([#15531](https://github.com/rapidsai/cudf/pull/15531)) [@raybellwaves](https://github.com/raybellwaves) +- rm-dup-doc in frame.py ([#15530](https://github.com/rapidsai/cudf/pull/15530)) [@raybellwaves](https://github.com/raybellwaves) +- Update CONTRIBUTING.md to use latest cuda env ([#15467](https://github.com/rapidsai/cudf/pull/15467)) [@raybellwaves](https://github.com/raybellwaves) +- Doc: interleave columns pandas compat ([#15383](https://github.com/rapidsai/cudf/pull/15383)) [@raybellwaves](https://github.com/raybellwaves) +- Simplified README Examples ([#15338](https://github.com/rapidsai/cudf/pull/15338)) [@wkaisertexas](https://github.com/wkaisertexas) +- Add debug tips section to libcudf developer guide ([#15329](https://github.com/rapidsai/cudf/pull/15329)) [@davidwendt](https://github.com/davidwendt) +- Fix and clarify notes on result ordering 
([#13255](https://github.com/rapidsai/cudf/pull/13255)) [@shwina](https://github.com/shwina) + +## 🚀 New Features + +- Add JNI bindings for zstd compression of NVCOMP. ([#15729](https://github.com/rapidsai/cudf/pull/15729)) [@firestarman](https://github.com/firestarman) +- Fix spaces around CSV quoted strings ([#15727](https://github.com/rapidsai/cudf/pull/15727)) [@thabetx](https://github.com/thabetx) +- Add default pinned pool that falls back to new pinned allocations ([#15665](https://github.com/rapidsai/cudf/pull/15665)) [@vuule](https://github.com/vuule) +- Overhaul ops-codeowners coverage ([#15660](https://github.com/rapidsai/cudf/pull/15660)) [@raydouglass](https://github.com/raydouglass) +- Concatenate dictionary of objects along axis=1 ([#15623](https://github.com/rapidsai/cudf/pull/15623)) [@er-eis](https://github.com/er-eis) +- Construct `pylibcudf` columns from objects supporting `__cuda_array_interface__` ([#15615](https://github.com/rapidsai/cudf/pull/15615)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Expose some Parquet per-column configuration options via the python API ([#15613](https://github.com/rapidsai/cudf/pull/15613)) [@etseidl](https://github.com/etseidl) +- Migrate string `find` operations to `pylibcudf` ([#15604](https://github.com/rapidsai/cudf/pull/15604)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Round trip FIXED_LEN_BYTE_ARRAY data properly in Parquet writer ([#15600](https://github.com/rapidsai/cudf/pull/15600)) [@etseidl](https://github.com/etseidl) +- Reading multi-line JSON in string columns using runtime configurable delimiter ([#15556](https://github.com/rapidsai/cudf/pull/15556)) [@shrshi](https://github.com/shrshi) +- Remove public gtest dependency from libcudf conda package 
([#15534](https://github.com/rapidsai/cudf/pull/15534)) [@robertmaynard](https://github.com/robertmaynard) +- Fea/move to latest nanoarrow ([#15526](https://github.com/rapidsai/cudf/pull/15526)) [@robertmaynard](https://github.com/robertmaynard) +- Migrate string `case` operations to `pylibcudf` ([#15489](https://github.com/rapidsai/cudf/pull/15489)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add Parquet encoding statistics to column chunk metadata ([#15452](https://github.com/rapidsai/cudf/pull/15452)) [@etseidl](https://github.com/etseidl) +- Implement JNI for chunked ORC reader ([#15446](https://github.com/rapidsai/cudf/pull/15446)) [@ttnghia](https://github.com/ttnghia) +- Add some missing optional fields to the Parquet RowGroup metadata ([#15421](https://github.com/rapidsai/cudf/pull/15421)) [@etseidl](https://github.com/etseidl) +- Adding parquet transcoding example ([#15420](https://github.com/rapidsai/cudf/pull/15420)) [@mhaseeb123](https://github.com/mhaseeb123) +- Add fields to Parquet Statistics structure that were added in parquet-format 2.10 ([#15412](https://github.com/rapidsai/cudf/pull/15412)) [@etseidl](https://github.com/etseidl) +- Add option to Parquet writer to skip compressing individual columns ([#15411](https://github.com/rapidsai/cudf/pull/15411)) [@etseidl](https://github.com/etseidl) +- Add BYTE_STREAM_SPLIT support to Parquet ([#15311](https://github.com/rapidsai/cudf/pull/15311)) [@etseidl](https://github.com/etseidl) +- Introduce benchmark suite for JSON reader options ([#15124](https://github.com/rapidsai/cudf/pull/15124)) [@shrshi](https://github.com/shrshi) +- Implement ORC chunked reader ([#15094](https://github.com/rapidsai/cudf/pull/15094)) [@ttnghia](https://github.com/ttnghia) +- Extend cudf devcontainers to specify 
jitify2 kernel cache ([#15068](https://github.com/rapidsai/cudf/pull/15068)) [@robertmaynard](https://github.com/robertmaynard) +- Add `to_arrow_device` function to cudf interop using nanoarrow ([#15047](https://github.com/rapidsai/cudf/pull/15047)) [@zeroshade](https://github.com/zeroshade) +- Add JSON option to prune columns ([#14996](https://github.com/rapidsai/cudf/pull/14996)) [@karthikeyann](https://github.com/karthikeyann) + +## 🛠️ Improvements + +- Deprecate `Groupby.collect` ([#15808](https://github.com/rapidsai/cudf/pull/15808)) [@galipremsagar](https://github.com/galipremsagar) +- Raise FileNotFoundError when a literal JSON string that looks like a json filename is passed ([#15806](https://github.com/rapidsai/cudf/pull/15806)) [@lithomas1](https://github.com/lithomas1) +- Deprecate `divisions='quantile'` support in `set_index` ([#15804](https://github.com/rapidsai/cudf/pull/15804)) [@rjzamora](https://github.com/rjzamora) +- Improve performance of Series.to_numpy/to_cupy ([#15792](https://github.com/rapidsai/cudf/pull/15792)) [@mroeschke](https://github.com/mroeschke) +- Access `self.index` instead of `self._index` where possible ([#15781](https://github.com/rapidsai/cudf/pull/15781)) [@mroeschke](https://github.com/mroeschke) +- Support filtered I/O in `chunked_parquet_reader` and simplify the use of `parquet_reader_options` ([#15764](https://github.com/rapidsai/cudf/pull/15764)) [@mhaseeb123](https://github.com/mhaseeb123) +- Avoid index-to-column conversion in some DataFrame ops ([#15763](https://github.com/rapidsai/cudf/pull/15763)) [@mroeschke](https://github.com/mroeschke) +- Fix `chunked_parquet_reader` behavior when input has no more rows to read ([#15757](https://github.com/rapidsai/cudf/pull/15757)) [@mhaseeb123](https://github.com/mhaseeb123) +- [JNI] Expose java API for 
cudf::io::config_host_memory_resource ([#15745](https://github.com/rapidsai/cudf/pull/15745)) [@abellina](https://github.com/abellina) +- Migrate all cpp pxd files into pylibcudf ([#15740](https://github.com/rapidsai/cudf/pull/15740)) [@vyasr](https://github.com/vyasr) +- Validate and materialize iterators earlier in as_column ([#15739](https://github.com/rapidsai/cudf/pull/15739)) [@mroeschke](https://github.com/mroeschke) +- Push some as_column arrow logic to ColumnBase.from_arrow ([#15738](https://github.com/rapidsai/cudf/pull/15738)) [@mroeschke](https://github.com/mroeschke) +- Expose stream parameter in public reduction APIs ([#15737](https://github.com/rapidsai/cudf/pull/15737)) [@srinivasyadav18](https://github.com/srinivasyadav18) +- remove unnecessary 'setuptools' host dependency, simplify dependencies.yaml ([#15736](https://github.com/rapidsai/cudf/pull/15736)) [@jameslamb](https://github.com/jameslamb) +- Defer to C++ equality and hashing for pylibcudf DataType and Aggregation objects ([#15732](https://github.com/rapidsai/cudf/pull/15732)) [@wence-](https://github.com/wence-) +- Implement null-aware NOT_EQUALS binop ([#15731](https://github.com/rapidsai/cudf/pull/15731)) [@wence-](https://github.com/wence-) +- Fix split-record result list column offset type ([#15707](https://github.com/rapidsai/cudf/pull/15707)) [@davidwendt](https://github.com/davidwendt) +- Upgrade `arrow` to `16` ([#15703](https://github.com/rapidsai/cudf/pull/15703)) [@galipremsagar](https://github.com/galipremsagar) +- Remove experimental namespace from make_strings_children ([#15702](https://github.com/rapidsai/cudf/pull/15702)) [@davidwendt](https://github.com/davidwendt) +- Rework get_json_object benchmark to use nvbench ([#15698](https://github.com/rapidsai/cudf/pull/15698)) 
[@davidwendt](https://github.com/davidwendt) +- Rework some python tests of Parquet delta encodings ([#15693](https://github.com/rapidsai/cudf/pull/15693)) [@etseidl](https://github.com/etseidl) +- Skeleton cudf polars package ([#15688](https://github.com/rapidsai/cudf/pull/15688)) [@wence-](https://github.com/wence-) +- Upgrade pre commit hooks ([#15685](https://github.com/rapidsai/cudf/pull/15685)) [@wence-](https://github.com/wence-) +- Allow `fillna` to validate for `CategoricalColumn.fillna` ([#15683](https://github.com/rapidsai/cudf/pull/15683)) [@galipremsagar](https://github.com/galipremsagar) +- Misc Column cleanups ([#15682](https://github.com/rapidsai/cudf/pull/15682)) [@mroeschke](https://github.com/mroeschke) +- Reducing runtime of JSON reader options benchmark ([#15681](https://github.com/rapidsai/cudf/pull/15681)) [@shrshi](https://github.com/shrshi) +- Add `Timestamp` and `Timedelta` proxy types ([#15680](https://github.com/rapidsai/cudf/pull/15680)) [@galipremsagar](https://github.com/galipremsagar) +- Remove host_parse_nested_json. 
([#15674](https://github.com/rapidsai/cudf/pull/15674)) [@bdice](https://github.com/bdice) +- Reduce runtime for ParquetChunkedReaderInputLimitTest gtests ([#15672](https://github.com/rapidsai/cudf/pull/15672)) [@davidwendt](https://github.com/davidwendt) +- Add large-strings gtest for cudf::interleave_columns ([#15669](https://github.com/rapidsai/cudf/pull/15669)) [@davidwendt](https://github.com/davidwendt) +- Use experimental make_strings_children for multi-replace_re ([#15667](https://github.com/rapidsai/cudf/pull/15667)) [@davidwendt](https://github.com/davidwendt) +- Enabled `Holiday` types in `cudf.pandas` ([#15664](https://github.com/rapidsai/cudf/pull/15664)) [@galipremsagar](https://github.com/galipremsagar) +- Remove obsolete `XFAIL` markers for query-planning ([#15662](https://github.com/rapidsai/cudf/pull/15662)) [@rjzamora](https://github.com/rjzamora) +- Clean up join benchmarks ([#15644](https://github.com/rapidsai/cudf/pull/15644)) [@PointKernel](https://github.com/PointKernel) +- Enable warnings as errors in custreamz ([#15642](https://github.com/rapidsai/cudf/pull/15642)) [@mroeschke](https://github.com/mroeschke) +- Improve distinct join with set `retrieve` ([#15636](https://github.com/rapidsai/cudf/pull/15636)) [@PointKernel](https://github.com/PointKernel) +- Fix -Werror=type-limits. ([#15635](https://github.com/rapidsai/cudf/pull/15635)) [@bdice](https://github.com/bdice) +- Enable FutureWarnings/DeprecationWarnings as errors for dask_cudf ([#15634](https://github.com/rapidsai/cudf/pull/15634)) [@mroeschke](https://github.com/mroeschke) +- Remove NVBench SHA override. 
([#15633](https://github.com/rapidsai/cudf/pull/15633)) [@alliepiper](https://github.com/alliepiper) +- Add support for large string columns to Parquet reader and writer ([#15632](https://github.com/rapidsai/cudf/pull/15632)) [@etseidl](https://github.com/etseidl) +- Large strings support in MD5 and SHA hashers ([#15631](https://github.com/rapidsai/cudf/pull/15631)) [@davidwendt](https://github.com/davidwendt) +- Fix make_offsets_child_column usage in cudf::strings::detail::shift ([#15630](https://github.com/rapidsai/cudf/pull/15630)) [@davidwendt](https://github.com/davidwendt) +- Use experimental make_strings_children for strings convert ([#15629](https://github.com/rapidsai/cudf/pull/15629)) [@davidwendt](https://github.com/davidwendt) +- Forward-merge branch-24.04 to branch-24.06 ([#15627](https://github.com/rapidsai/cudf/pull/15627)) [@bdice](https://github.com/bdice) +- Avoid accessing attributes via `_column` if not needed ([#15624](https://github.com/rapidsai/cudf/pull/15624)) [@mroeschke](https://github.com/mroeschke) +- Make ColumnBase.__cuda_array_interface__ opt out instead of opt in ([#15622](https://github.com/rapidsai/cudf/pull/15622)) [@mroeschke](https://github.com/mroeschke) +- Large strings support for cudf::gather ([#15621](https://github.com/rapidsai/cudf/pull/15621)) [@davidwendt](https://github.com/davidwendt) +- Remove jni-docker-build workflow ([#15619](https://github.com/rapidsai/cudf/pull/15619)) [@bdice](https://github.com/bdice) +- Support `DurationType` in cudf parquet reader via `arrow:schema` ([#15617](https://github.com/rapidsai/cudf/pull/15617)) [@mhaseeb123](https://github.com/mhaseeb123) +- Drop Centos7 support ([#15608](https://github.com/rapidsai/cudf/pull/15608)) [@NvTimLiu](https://github.com/NvTimLiu) +- Use experimental 
make_strings_children for json/csv writers ([#15599](https://github.com/rapidsai/cudf/pull/15599)) [@davidwendt](https://github.com/davidwendt) +- Use experimental make_strings_children for strings join/url_encode/slice ([#15598](https://github.com/rapidsai/cudf/pull/15598)) [@davidwendt](https://github.com/davidwendt) +- Use experimental make_strings_children in nvtext APIs ([#15595](https://github.com/rapidsai/cudf/pull/15595)) [@davidwendt](https://github.com/davidwendt) +- Migrate to `{{ stdlib("c") }}` ([#15594](https://github.com/rapidsai/cudf/pull/15594)) [@hcho3](https://github.com/hcho3) +- Deprecate `to/from_dask_dataframe` APIs in dask-cudf ([#15592](https://github.com/rapidsai/cudf/pull/15592)) [@rjzamora](https://github.com/rjzamora) +- Minor fixups for future NumPy 2 compatibility ([#15590](https://github.com/rapidsai/cudf/pull/15590)) [@seberg](https://github.com/seberg) +- Delay materializing RangeIndex in .reset_index ([#15588](https://github.com/rapidsai/cudf/pull/15588)) [@mroeschke](https://github.com/mroeschke) +- Use experimental make_strings_children for capitalize/case/pad functions ([#15587](https://github.com/rapidsai/cudf/pull/15587)) [@davidwendt](https://github.com/davidwendt) +- Use experimental make_strings_children for strings replace/filter/translate ([#15586](https://github.com/rapidsai/cudf/pull/15586)) [@davidwendt](https://github.com/davidwendt) +- Add multithreaded parquet reader benchmarks. 
([#15585](https://github.com/rapidsai/cudf/pull/15585)) [@nvdbaranec](https://github.com/nvdbaranec) +- Don't materialize column during RangeIndex methods ([#15582](https://github.com/rapidsai/cudf/pull/15582)) [@mroeschke](https://github.com/mroeschke) +- Improve performance for cudf::strings::count_re ([#15578](https://github.com/rapidsai/cudf/pull/15578)) [@davidwendt](https://github.com/davidwendt) +- Replace RangeIndex._start/_stop/_step with _range ([#15576](https://github.com/rapidsai/cudf/pull/15576)) [@mroeschke](https://github.com/mroeschke) +- add --rm and --name to devcontainer run args ([#15572](https://github.com/rapidsai/cudf/pull/15572)) [@trxcllnt](https://github.com/trxcllnt) +- Change the default dictionary policy in Parquet writer from `ALWAYS` to `ADAPTIVE` ([#15570](https://github.com/rapidsai/cudf/pull/15570)) [@mhaseeb123](https://github.com/mhaseeb123) +- Rename experimental JSON tests. ([#15568](https://github.com/rapidsai/cudf/pull/15568)) [@bdice](https://github.com/bdice) +- Refactor JNI native dependency loading to allow returning of library path ([#15566](https://github.com/rapidsai/cudf/pull/15566)) [@jlowe](https://github.com/jlowe) +- Remove protobuf and use parsed ORC statistics from libcudf ([#15564](https://github.com/rapidsai/cudf/pull/15564)) [@bdice](https://github.com/bdice) +- Deprecate legacy JSON reader options. 
([#15558](https://github.com/rapidsai/cudf/pull/15558)) [@bdice](https://github.com/bdice) +- Use same .clang-format in cuDF JNI ([#15557](https://github.com/rapidsai/cudf/pull/15557)) [@bdice](https://github.com/bdice) +- Large strings support for cudf::fill ([#15555](https://github.com/rapidsai/cudf/pull/15555)) [@davidwendt](https://github.com/davidwendt) +- Upgrade upper bound pinning to `pandas-2.2.2` ([#15554](https://github.com/rapidsai/cudf/pull/15554)) [@galipremsagar](https://github.com/galipremsagar) +- Work around issues with cccl main ([#15552](https://github.com/rapidsai/cudf/pull/15552)) [@miscco](https://github.com/miscco) +- Enable pandas plotting unit tests for cudf.pandas ([#15547](https://github.com/rapidsai/cudf/pull/15547)) [@mroeschke](https://github.com/mroeschke) +- Move timezone conversion logic to `DatetimeColumn` ([#15545](https://github.com/rapidsai/cudf/pull/15545)) [@mroeschke](https://github.com/mroeschke) +- Large strings support for cudf::interleave_columns ([#15544](https://github.com/rapidsai/cudf/pull/15544)) [@davidwendt](https://github.com/davidwendt) +- [skip ci] Switch back to 24.06 branch for pandas tests ([#15543](https://github.com/rapidsai/cudf/pull/15543)) [@galipremsagar](https://github.com/galipremsagar) +- Remove checks dependency from static-configure test job. 
([#15542](https://github.com/rapidsai/cudf/pull/15542)) [@bdice](https://github.com/bdice) +- Remove legacy JSON reader from Python ([#15538](https://github.com/rapidsai/cudf/pull/15538)) [@bdice](https://github.com/bdice) +- Enable more ignored pandas unit tests for cudf.pandas ([#15535](https://github.com/rapidsai/cudf/pull/15535)) [@mroeschke](https://github.com/mroeschke) +- Large strings support for cudf::clamp ([#15533](https://github.com/rapidsai/cudf/pull/15533)) [@davidwendt](https://github.com/davidwendt) +- Remove version hard-coding ([#15529](https://github.com/rapidsai/cudf/pull/15529)) [@galipremsagar](https://github.com/galipremsagar) +- Removing all batching code from parquet writer ([#15528](https://github.com/rapidsai/cudf/pull/15528)) [@mhaseeb123](https://github.com/mhaseeb123) +- Make some private class properties not settable ([#15527](https://github.com/rapidsai/cudf/pull/15527)) [@mroeschke](https://github.com/mroeschke) +- Large strings support in regex replace APIs ([#15524](https://github.com/rapidsai/cudf/pull/15524)) [@davidwendt](https://github.com/davidwendt) +- Skip pandas unit tests that crash pytest workers in `cudf.pandas` ([#15521](https://github.com/rapidsai/cudf/pull/15521)) [@mroeschke](https://github.com/mroeschke) +- Preserve column metadata during more DataFrame operations ([#15519](https://github.com/rapidsai/cudf/pull/15519)) [@mroeschke](https://github.com/mroeschke) +- Move to pandas-tests to a dedicated workflow file and trigger it from branch.yaml ([#15516](https://github.com/rapidsai/cudf/pull/15516)) [@galipremsagar](https://github.com/galipremsagar) +- Large strings gtest fixture and utilities ([#15513](https://github.com/rapidsai/cudf/pull/15513)) [@davidwendt](https://github.com/davidwendt) +- Convert libcudf resource 
parameters to rmm::device_async_resource_ref ([#15507](https://github.com/rapidsai/cudf/pull/15507)) [@harrism](https://github.com/harrism) +- Relax protobuf lower bound to 3.20. ([#15506](https://github.com/rapidsai/cudf/pull/15506)) [@bdice](https://github.com/bdice) +- Clean up index methods ([#15496](https://github.com/rapidsai/cudf/pull/15496)) [@mroeschke](https://github.com/mroeschke) +- Update strings contains benchmarks to nvbench ([#15495](https://github.com/rapidsai/cudf/pull/15495)) [@davidwendt](https://github.com/davidwendt) +- Update NVBench fixture to use new hooks, fix pinned memory segfault. ([#15492](https://github.com/rapidsai/cudf/pull/15492)) [@alliepiper](https://github.com/alliepiper) +- Enable tests/scalar and test/series in cudf.pandas tests ([#15486](https://github.com/rapidsai/cudf/pull/15486)) [@mroeschke](https://github.com/mroeschke) +- Clean up __cuda_array_interface__ handling in as_column ([#15477](https://github.com/rapidsai/cudf/pull/15477)) [@mroeschke](https://github.com/mroeschke) +- Avoid .ordered and .categories from being settable in CategoricalColumn and CategoricalDtype ([#15475](https://github.com/rapidsai/cudf/pull/15475)) [@mroeschke](https://github.com/mroeschke) +- Ignore pandas tests for cudf.pandas that need motoserver ([#15468](https://github.com/rapidsai/cudf/pull/15468)) [@mroeschke](https://github.com/mroeschke) +- Use cached_property for NumericColumn.nan_count instead of ._nan_count variable ([#15466](https://github.com/rapidsai/cudf/pull/15466)) [@mroeschke](https://github.com/mroeschke) +- Add to_arrow_device() functions that accept views ([#15465](https://github.com/rapidsai/cudf/pull/15465)) [@davidwendt](https://github.com/davidwendt) +- Add custom status check workflow 
([#15464](https://github.com/rapidsai/cudf/pull/15464)) [@galipremsagar](https://github.com/galipremsagar) +- Disable pandas 2.x clipboard tests in cudf.pandas tests ([#15462](https://github.com/rapidsai/cudf/pull/15462)) [@mroeschke](https://github.com/mroeschke) +- Enable tests/strings/test_api.py and tests/io/pytables in cudf.pandas tests ([#15461](https://github.com/rapidsai/cudf/pull/15461)) [@mroeschke](https://github.com/mroeschke) +- Enable test_parsing in cudf.pandas tests ([#15460](https://github.com/rapidsai/cudf/pull/15460)) [@mroeschke](https://github.com/mroeschke) +- Add `from_arrow_device` function to cudf interop using nanoarrow ([#15458](https://github.com/rapidsai/cudf/pull/15458)) [@zeroshade](https://github.com/zeroshade) +- Remove deprecated strings offsets_begin ([#15454](https://github.com/rapidsai/cudf/pull/15454)) [@davidwendt](https://github.com/davidwendt) +- Enable tests/windows/ in cudf.pandas tests ([#15444](https://github.com/rapidsai/cudf/pull/15444)) [@mroeschke](https://github.com/mroeschke) +- Enable tests/interchange/test_impl.py in cudf.pandas tests ([#15443](https://github.com/rapidsai/cudf/pull/15443)) [@mroeschke](https://github.com/mroeschke) +- Enable tests/io/test_user_agent.py in cudf pandas tests ([#15442](https://github.com/rapidsai/cudf/pull/15442)) [@mroeschke](https://github.com/mroeschke) +- Performance improvement in libcudf case conversion for long strings ([#15441](https://github.com/rapidsai/cudf/pull/15441)) [@davidwendt](https://github.com/davidwendt) +- Remove prior test skipping in run-pandas-tests with testing 2.2.1 ([#15440](https://github.com/rapidsai/cudf/pull/15440)) [@mroeschke](https://github.com/mroeschke) +- Support orc and text IO with dask-expr using legacy conversion 
([#15439](https://github.com/rapidsai/cudf/pull/15439)) [@rjzamora](https://github.com/rjzamora) +- Floating <--> fixed-point conversion must now be called explicitly ([#15438](https://github.com/rapidsai/cudf/pull/15438)) [@pmattione-nvidia](https://github.com/pmattione-nvidia) +- Unify Copy-On-Write and Spilling ([#15436](https://github.com/rapidsai/cudf/pull/15436)) [@madsbk](https://github.com/madsbk) +- Enable ``dask_cudf`` json and s3 tests with query-planning on ([#15408](https://github.com/rapidsai/cudf/pull/15408)) [@rjzamora](https://github.com/rjzamora) +- Bump ruff and codespell pre-commit checks ([#15407](https://github.com/rapidsai/cudf/pull/15407)) [@mroeschke](https://github.com/mroeschke) +- Enable all tests for `arm` arch ([#15402](https://github.com/rapidsai/cudf/pull/15402)) [@galipremsagar](https://github.com/galipremsagar) +- Bind `read_parquet_metadata` API to libcudf instead of pyarrow and extract `RowGroup` information ([#15398](https://github.com/rapidsai/cudf/pull/15398)) [@mhaseeb123](https://github.com/mhaseeb123) +- Optimizing multi-source byte range reading in JSON reader ([#15396](https://github.com/rapidsai/cudf/pull/15396)) [@shrshi](https://github.com/shrshi) +- add correct labels to pandas_function_request.md ([#15381](https://github.com/rapidsai/cudf/pull/15381)) [@raybellwaves](https://github.com/raybellwaves) +- Remove deprecated hash() and spark_murmurhash3_x86_32() ([#15375](https://github.com/rapidsai/cudf/pull/15375)) [@davidwendt](https://github.com/davidwendt) +- Large strings support in cudf::merge ([#15374](https://github.com/rapidsai/cudf/pull/15374)) [@davidwendt](https://github.com/davidwendt) +- Enable test-reporting for pandas pytests in CI ([#15369](https://github.com/rapidsai/cudf/pull/15369)) 
[@galipremsagar](https://github.com/galipremsagar) +- Use logical types in Parquet reader ([#15365](https://github.com/rapidsai/cudf/pull/15365)) [@etseidl](https://github.com/etseidl) +- Add experimental make_strings_children utility ([#15363](https://github.com/rapidsai/cudf/pull/15363)) [@davidwendt](https://github.com/davidwendt) +- Forward-merge branch-24.04 to branch-24.06 ([#15349](https://github.com/rapidsai/cudf/pull/15349)) [@bdice](https://github.com/bdice) +- Fix CMake files in libcudf C++ examples to use existing libcudf build if present ([#15348](https://github.com/rapidsai/cudf/pull/15348)) [@mhaseeb123](https://github.com/mhaseeb123) +- Use ruff pydocstyle over pydocstyle pre-commit hook ([#15345](https://github.com/rapidsai/cudf/pull/15345)) [@mroeschke](https://github.com/mroeschke) +- Refactor stream mode setup for gtests ([#15337](https://github.com/rapidsai/cudf/pull/15337)) [@davidwendt](https://github.com/davidwendt) +- Benchmark decimal <--> floating conversions. ([#15334](https://github.com/rapidsai/cudf/pull/15334)) [@pmattione-nvidia](https://github.com/pmattione-nvidia) +- Avoid duplicate dask-cudf testing ([#15333](https://github.com/rapidsai/cudf/pull/15333)) [@rjzamora](https://github.com/rjzamora) +- Skip decode steps in Parquet reader when nullable columns have no nulls ([#15332](https://github.com/rapidsai/cudf/pull/15332)) [@etseidl](https://github.com/etseidl) +- Update udf_cpp to use rapids_cpm_cccl. 
([#15331](https://github.com/rapidsai/cudf/pull/15331)) [@bdice](https://github.com/bdice) +- Forward-merge branch-24.04 into branch-24.06 [skip ci] ([#15330](https://github.com/rapidsai/cudf/pull/15330)) [@rapids-bot[bot]](https://github.com/rapids-bot[bot]) +- Allow ``numeric_only=True`` for simple groupby reductions ([#15326](https://github.com/rapidsai/cudf/pull/15326)) [@rjzamora](https://github.com/rjzamora) +- Drop CentOS 7 support. ([#15323](https://github.com/rapidsai/cudf/pull/15323)) [@bdice](https://github.com/bdice) +- Rework cudf::find_and_replace_all to use gather-based make_strings_column ([#15305](https://github.com/rapidsai/cudf/pull/15305)) [@davidwendt](https://github.com/davidwendt) +- First pass at adding testing for pylibcudf ([#15300](https://github.com/rapidsai/cudf/pull/15300)) [@vyasr](https://github.com/vyasr) +- [FEA] Performance improvement for mixed left semi/anti join ([#15288](https://github.com/rapidsai/cudf/pull/15288)) [@tgujar](https://github.com/tgujar) +- Rework cudf::replace_nulls to use strings::detail::copy_if_else ([#15286](https://github.com/rapidsai/cudf/pull/15286)) [@davidwendt](https://github.com/davidwendt) +- Clean up special casing in `as_column` for non-typed input ([#15276](https://github.com/rapidsai/cudf/pull/15276)) [@mroeschke](https://github.com/mroeschke) +- Large strings support in cudf::concatenate ([#15195](https://github.com/rapidsai/cudf/pull/15195)) [@davidwendt](https://github.com/davidwendt) +- Use less _is_categorical_dtype ([#15148](https://github.com/rapidsai/cudf/pull/15148)) [@mroeschke](https://github.com/mroeschke) +- Align date_range defaults with pandas, support tz ([#15139](https://github.com/rapidsai/cudf/pull/15139)) [@mroeschke](https://github.com/mroeschke) +- `ModuleAccelerator` performance: 
cache the result of checking if a caller is in the denylist ([#15056](https://github.com/rapidsai/cudf/pull/15056)) [@shwina](https://github.com/shwina) +- Use offsetalator in cudf::strings::replace functions ([#14824](https://github.com/rapidsai/cudf/pull/14824)) [@davidwendt](https://github.com/davidwendt) +- Cleanup some timedelta/datetime column logic ([#14715](https://github.com/rapidsai/cudf/pull/14715)) [@mroeschke](https://github.com/mroeschke) +- Refactor numpy array input in as_column ([#14651](https://github.com/rapidsai/cudf/pull/14651)) [@mroeschke](https://github.com/mroeschke) +- Refactor joins for conditional semis and antis ([#14646](https://github.com/rapidsai/cudf/pull/14646)) [@DanialJavady96](https://github.com/DanialJavady96) +- Eagerly populate the class dict for cudf.pandas proxy types ([#14534](https://github.com/rapidsai/cudf/pull/14534)) [@shwina](https://github.com/shwina) +- Some additional kernel thread index refactoring. ([#14107](https://github.com/rapidsai/cudf/pull/14107)) [@bdice](https://github.com/bdice) + # cuDF 24.04.00 (10 Apr 2024) ## 🚨 Breaking Changes diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e7f7a20e307..98c2ec0a22e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -105,7 +105,7 @@ Instructions for a minimal build environment without conda are included below. 
# create the conda environment (assuming in base `cudf` directory) # note: RAPIDS currently doesn't support `channel_priority: strict`; # use `channel_priority: flexible` instead -conda env create --name cudf_dev --file conda/environments/all_cuda-118_arch-x86_64.yaml +conda env create --name cudf_dev --file conda/environments/all_cuda-122_arch-x86_64.yaml # activate the environment conda activate cudf_dev ``` @@ -161,6 +161,8 @@ To build all libraries and tests, with Python packages in development mode, simp ./build.sh --pydevelop libcudf libcudf_kafka cudf dask_cudf cudf_kafka custreamz ``` +- **Note**: if Cython files (`*.pyx` or `*.pxd`) have changed, the Python build must be rerun. + To run the C++ tests, run ```bash @@ -217,7 +219,7 @@ cuda-gdb -ex r --args python .py ``` ```bash -cuda-memcheck python .py +compute-sanitizer --tool memcheck python .py ``` ### Device debug symbols diff --git a/README.md b/README.md index 8f9e57ff3ad..75ee405bc1f 100644 --- a/README.md +++ b/README.md @@ -14,13 +14,8 @@ You can import `cudf` directly and use it like `pandas`: ```python import cudf -import requests -from io import StringIO -url = "https://github.com/plotly/datasets/raw/master/tips.csv" -content = requests.get(url).content.decode("utf-8") - -tips_df = cudf.read_csv(StringIO(content)) +tips_df = cudf.read_csv("https://github.com/plotly/datasets/raw/master/tips.csv") tips_df["tip_percentage"] = tips_df["tip"] / tips_df["total_bill"] * 100 # display average tip by dining party size @@ -36,13 +31,8 @@ supported operations and falling back to pandas when needed: %load_ext cudf.pandas # pandas operations now use the GPU! 
import pandas as pd -import requests -from io import StringIO - -url = "https://github.com/plotly/datasets/raw/master/tips.csv" -content = requests.get(url).content.decode("utf-8") -tips_df = pd.read_csv(StringIO(content)) +tips_df = pd.read_csv("https://github.com/plotly/datasets/raw/master/tips.csv") tips_df["tip_percentage"] = tips_df["tip"] / tips_df["total_bill"] * 100 # display average tip by dining party size @@ -93,7 +83,7 @@ cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=24.04 python=3.11 cuda-version=12.2 + cudf=24.06 python=3.11 cuda-version=12.2 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/VERSION b/VERSION index 1f534289510..0bff6981a3d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.04.01 +24.06.00 diff --git a/build.sh b/build.sh index e5daf2f3451..43bb04f7a18 100755 --- a/build.sh +++ b/build.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # cuDF build script @@ -109,8 +109,8 @@ function buildAll { } function buildLibCudfJniInDocker { - local cudaVersion="11.5.0" - local imageName="cudf-build:${cudaVersion}-devel-centos7" + local cudaVersion="11.8.0" + local imageName="cudf-build:${cudaVersion}-devel-rocky8" local CMAKE_GENERATOR="${CMAKE_GENERATOR:-Ninja}" local workspaceDir="/rapids" local localMavenRepo=${LOCAL_MAVEN_REPO:-"$HOME/.m2/repository"} @@ -120,7 +120,7 @@ function buildLibCudfJniInDocker { mkdir -p "$CUDF_JAR_JAVA_BUILD_DIR/libcudf-cmake-build" mkdir -p "$HOME/.ccache" "$HOME/.m2" nvidia-docker build \ - -f java/ci/Dockerfile.centos7 \ + -f java/ci/Dockerfile.rocky \ --build-arg CUDA_VERSION=${cudaVersion} \ -t $imageName . 
nvidia-docker run -it -u $(id -u):$(id -g) --rm \ diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 668d52e530b..db306046667 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -46,11 +46,9 @@ pushd docs/cudf make dirhtml mkdir -p "${RAPIDS_DOCS_DIR}/cudf/html" mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/cudf/html" -if [[ "${RAPIDS_BUILD_TYPE}" != "pull-request" ]]; then - make text - mkdir -p "${RAPIDS_DOCS_DIR}/cudf/txt" - mv build/text/* "${RAPIDS_DOCS_DIR}/cudf/txt" -fi +make text +mkdir -p "${RAPIDS_DOCS_DIR}/cudf/txt" +mv build/text/* "${RAPIDS_DOCS_DIR}/cudf/txt" popd rapids-logger "Build dask-cuDF Sphinx docs" @@ -58,11 +56,9 @@ pushd docs/dask_cudf make dirhtml mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/html" mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html" -if [[ "${RAPIDS_BUILD_TYPE}" != "pull-request" ]]; then - make text - mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/txt" - mv build/text/* "${RAPIDS_DOCS_DIR}/dask-cudf/txt" -fi +make text +mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/txt" +mv build/text/* "${RAPIDS_DOCS_DIR}/dask-cudf/txt" popd rapids-upload-docs diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index cde22bb70d1..f0886a28fd9 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -13,4 +13,4 @@ python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/* RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="cudf_${AUDITWHEEL_POLICY}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist diff --git a/ci/configure_cpp_static.sh b/ci/configure_cpp_static.sh new file mode 100755 index 00000000000..11d5585d98f --- /dev/null +++ b/ci/configure_cpp_static.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +set -euo pipefail + +source rapids-date-string + +rapids-logger "Configure static cpp build" + +ENV_YAML_DIR="$(mktemp -d)" +REQUIREMENTS_FILE="${ENV_YAML_DIR}/requirements.txt" + +rapids-dependency-file-generator \ + --output requirements \ + --file_key test_static_build \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${REQUIREMENTS_FILE}" + +python -m pip install -r "${REQUIREMENTS_FILE}" +pyenv rehash + +cmake -S cpp -B build_static -GNinja -DBUILD_SHARED_LIBS=OFF -DCUDF_USE_ARROW_STATIC=ON -DBUILD_TESTS=OFF diff --git a/ci/cudf_pandas_scripts/pandas-tests/diff.sh b/ci/cudf_pandas_scripts/pandas-tests/diff.sh index 37adabdb9c6..6cf70a2347f 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/diff.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/diff.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 @@ -7,18 +7,32 @@ # branch and the PR branch: # Hard-coded needs to match the version deduced by rapids-upload-artifacts-dir -MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py310.main-results.json -PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py310.pr-results.json -aws s3 cp $MAIN_ARTIFACT main-results.json +GH_JOB_NAME="pandas-tests-diff / build" +RAPIDS_FULL_VERSION=$(<./VERSION) +rapids-logger "Github job name: ${GH_JOB_NAME}" +rapids-logger "Rapids version: ${RAPIDS_FULL_VERSION}" + +PY_VER="39" +MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json +PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-${RAPIDS_FULL_VERSION}-results.json + +rapids-logger "Fetching latest available results from nightly" +aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/" --query "sort_by(Contents[?ends_with(Key, '_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json')], &LastModified)[::].[Key]" --output text | tee s3_output.txt +COMPARE_ENV=$(tail -n 1 s3_output.txt) +rapids-logger "Latest available results from nightly: ${COMPARE_ENV}" + +aws s3 cp "s3://rapids-downloads/${COMPARE_ENV}" main-results.json aws s3 cp $PR_ARTIFACT pr-results.json # Compute the diff and prepare job summary: python -m pip install pandas tabulate python ci/cudf_pandas_scripts/pandas-tests/job-summary.py main-results.json pr-results.json | tee summary.txt >> "$GITHUB_STEP_SUMMARY" -COMMENT=$(head -1 summary.txt) - +COMMENT=$(head -1 summary.txt | grep -oP '\d+/\d+ \(\d+\.\d+%\).*?(a decrease by|an increase by) \d+\.\d+%') echo "$COMMENT" - -# Magic name that the custom-job.yaml workflow reads and re-exports -echo "job_output=${COMMENT}" >> "${GITHUB_OUTPUT}" +jq --arg COMMENT "$COMMENT" --arg GH_JOB_NAME "$GH_JOB_NAME" -n \ + '{"context": "Pandas tests", + "description": $COMMENT, + "state":"success", + "job_name": $GH_JOB_NAME}' \ + > gh-status.json diff --git 
a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py index 1e83e51ab04..93a815838b7 100644 --- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py +++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -40,7 +40,7 @@ def get_total_and_passed(results): "Merging this PR would result in " f"{pr_passed}/{pr_total} ({passing_percentage:.2f}%) " "Pandas tests passing, " - f"{rate_change_type} in the test pass rate by " + f"{rate_change_type} by " f"{pass_rate_change:.2f}%. " f"Trunk stats: {main_passed}/{main_total}." ) diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh index f3c37ecde26..abde5e5d160 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/run.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh @@ -6,25 +6,12 @@ set -euo pipefail PANDAS_TESTS_BRANCH=${1} - -rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch" +RAPIDS_FULL_VERSION=$(<./VERSION) +rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch and rapids-version $RAPIDS_FULL_VERSION" rapids-logger "PR number: ${RAPIDS_REF_NAME:-"unknown"}" -# Set the manylinux version used for downloading the wheels so that we test the -# newer ABI wheels on the newer images that support their installation. -# Need to disable pipefail for the head not to fail, see -# https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q -set +o pipefail -glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' 
-f2) -set -o pipefail -manylinux_version="2_17" -if [[ ${glibc_minor_version} -ge 28 ]]; then - manylinux_version="2_28" -fi -manylinux="manylinux_${manylinux_version}" - RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,pandas-tests] RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} @@ -40,9 +27,10 @@ bash python/cudf/cudf/pandas/scripts/run-pandas-tests.sh \ --dist worksteal \ --report-log=${PANDAS_TESTS_BRANCH}.json 2>&1 +SUMMARY_FILE_NAME=${PANDAS_TESTS_BRANCH}-${RAPIDS_FULL_VERSION}-results.json # summarize the results and save them to artifacts: -python python/cudf/cudf/pandas/scripts/summarize-test-results.py --output json pandas-testing/${PANDAS_TESTS_BRANCH}.json > pandas-testing/${PANDAS_TESTS_BRANCH}-results.json +python python/cudf/cudf/pandas/scripts/summarize-test-results.py --output json pandas-testing/${PANDAS_TESTS_BRANCH}.json > pandas-testing/${SUMMARY_FILE_NAME} RAPIDS_ARTIFACTS_DIR=${RAPIDS_ARTIFACTS_DIR:-"${PWD}/artifacts"} mkdir -p "${RAPIDS_ARTIFACTS_DIR}" -mv pandas-testing/${PANDAS_TESTS_BRANCH}-results.json ${RAPIDS_ARTIFACTS_DIR}/ -rapids-upload-to-s3 ${RAPIDS_ARTIFACTS_DIR}/${PANDAS_TESTS_BRANCH}-results.json "${RAPIDS_ARTIFACTS_DIR}" +mv pandas-testing/${SUMMARY_FILE_NAME} ${RAPIDS_ARTIFACTS_DIR}/ +rapids-upload-to-s3 ${RAPIDS_ARTIFACTS_DIR}/${SUMMARY_FILE_NAME} "${RAPIDS_ARTIFACTS_DIR}" diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index 4f1e4bbf993..78945d37f22 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -1,5 +1,5 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -31,21 +31,8 @@ done if [ "$no_cudf" = true ]; then echo "Skipping cudf install" else - # Set the manylinux version used for downloading the wheels so that we test the - # newer ABI wheels on the newer images that support their installation. - # Need to disable pipefail for the head not to fail, see - # https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q - set +o pipefail - glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2) - set -o pipefail - manylinux_version="2_17" - if [[ ${glibc_minor_version} -ge 28 ]]; then - manylinux_version="2_28" - fi - manylinux="manylinux_${manylinux_version}" - RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" - RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep + RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,cudf-pandas-tests] fi diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 7cacdfd39c3..beeb130f0f1 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -70,7 +70,7 @@ sed_runner "s/version == ${CURRENT_SHORT_TAG}/version == ${NEXT_SHORT_TAG}/g" RE sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md # Libcudf examples update -sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/fetch_dependencies.cmake +sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/versions.cmake # CI files for FILE in .github/workflows/*.yaml; do @@ -88,4 +88,5 @@ sed_runner "s/cudf-.*-SNAPSHOT/cudf-${NEXT_FULL_JAVA_TAG}/g" java/ci/README.md find .devcontainer/ -type f 
-name devcontainer.json -print0 | while IFS= read -r -d '' filename; do sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "$(unknown)" sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "$(unknown)" + sed_runner "s@rapids-\${localWorkspaceFolderBasename}-[0-9.]*@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "$(unknown)" done diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh new file mode 100755 index 00000000000..0819eacf636 --- /dev/null +++ b/ci/run_cudf_examples.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -uo pipefail + +EXITCODE=0 +trap "EXITCODE=1" ERR + +# Support customizing the examples' install location +cd "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/examples/libcudf/"; + +# compute-sanitizer not available before CUDA 11.6 +if [[ "${RAPIDS_CUDA_VERSION%.*}" < "11.6" ]]; then + echo "compute-sanitizer unavailable pre 11.6" + exit 0 +fi + +compute-sanitizer --tool memcheck basic_example + +compute-sanitizer --tool memcheck deduplication + +compute-sanitizer --tool memcheck custom_optimized names.csv +compute-sanitizer --tool memcheck custom_prealloc names.csv +compute-sanitizer --tool memcheck custom_with_malloc names.csv + +compute-sanitizer --tool memcheck parquet_io +compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE + +exit ${EXITCODE} diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 995c8d7d71f..7865849bb74 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -17,6 +17,12 @@ rapids-logger "Run libcudf gtests" ./ci/run_cudf_ctests.sh -j20 SUITEERROR=$? +if (( ${SUITEERROR} == 0 )); then + rapids-logger "Run libcudf examples" + ./ci/run_cudf_examples.sh + SUITEERROR=$?
+fi + if (( ${SUITEERROR} == 0 )); then rapids-logger "Run libcudf_kafka gtests" ./ci/run_cudf_kafka_ctests.sh -j20 diff --git a/ci/test_cpp_common.sh b/ci/test_cpp_common.sh old mode 100644 new mode 100755 index e1b2a367187..da847137a2b --- a/ci/test_cpp_common.sh +++ b/ci/test_cpp_common.sh @@ -31,7 +31,7 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ - libcudf libcudf_kafka libcudf-tests + libcudf libcudf_kafka libcudf-tests libcudf-example rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh index bacb54b3896..217dd2fd9a8 100755 --- a/ci/test_python_cudf.sh +++ b/ci/test_python_cudf.sh @@ -14,6 +14,14 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e +rapids-logger "pytest pylibcudf" +pushd python/cudf/cudf/pylibcudf_tests +python -m pytest \ + --cache-clear \ + --dist=worksteal \ + . +popd + rapids-logger "pytest cudf" ./ci/run_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \ diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index 8ecd02f70a1..cbc1dc1cb87 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -19,8 +19,8 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e -rapids-logger "pytest dask_cudf" -./ci/run_dask_cudf_pytests.sh \ +rapids-logger "pytest dask_cudf (dask-expr)" +DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ --dist=worksteal \ @@ -29,10 +29,9 @@ rapids-logger "pytest dask_cudf" --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \ --cov-report=term -# Run tests in dask_cudf/tests and dask_cudf/io/tests with dask-expr -rapids-logger "pytest dask_cudf + dask_expr" -DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-expr.xml" \ +rapids-logger "pytest dask_cudf (legacy)" +DASK_DATAFRAME__QUERY_PLANNING=False ./ci/run_dask_cudf_pytests.sh \ + 
--junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ --numprocesses=8 \ --dist=loadscope \ . diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index af5779f478a..fdb61278d36 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -3,21 +3,8 @@ set -eou pipefail -# Set the manylinux version used for downloading the wheels so that we test the -# newer ABI wheels on the newer images that support their installation. -# Need to disable pipefail for the head not to fail, see -# https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q -set +o pipefail -glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2) -set -o pipefail -manylinux_version="2_17" -if [[ ${glibc_minor_version} -ge 28 ]]; then - manylinux_version="2_28" -fi -manylinux="manylinux_${manylinux_version}" - RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/cudf*.whl)[test] @@ -26,18 +13,21 @@ RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" -# Run smoke tests for aarch64 pull requests -if [[ "$(arch)" == "aarch64" && ${RAPIDS_BUILD_TYPE} == "pull-request" ]]; then - rapids-logger "Run smoke tests for cudf" - python ./ci/wheel_smoke_test_cudf.py -else - rapids-logger "pytest cudf" - pushd python/cudf/cudf/tests - python -m pytest \ - --cache-clear \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \ - --numprocesses=8 \ - --dist=worksteal \ - . - popd -fi + +rapids-logger "pytest pylibcudf" +pushd python/cudf/cudf/pylibcudf_tests +python -m pytest \ + --cache-clear \ + --dist=worksteal \ + . 
+popd + +rapids-logger "pytest cudf" +pushd python/cudf/cudf/tests +python -m pytest \ + --cache-clear \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \ + --numprocesses=8 \ + --dist=worksteal \ + . +popd diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 398eed43ea4..2b20b9d9ce4 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -7,20 +7,7 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist # Download the cudf built in the previous step -# Set the manylinux version used for downloading the wheels so that we test the -# newer ABI wheels on the newer images that support their installation. -# Need to disable pipefail for the head not to fail, see -# https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q -set +o pipefail -glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2) -set -o pipefail -manylinux_version="2_17" -if [[ ${glibc_minor_version} -ge 28 ]]; then - manylinux_version="2_28" -fi -manylinux="manylinux_${manylinux_version}" - -RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep python -m pip install --no-deps ./local-cudf-dep/cudf*.whl # echo to expand wildcard before adding `[extra]` requires for pip @@ -31,19 +18,19 @@ RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" # Run tests in dask_cudf/tests and dask_cudf/io/tests -rapids-logger "pytest dask_cudf" +rapids-logger "pytest dask_cudf (dask-expr)" pushd python/dask_cudf/dask_cudf -python -m pytest \ +DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ . 
popd -# Run tests in dask_cudf/tests and dask_cudf/io/tests with dask-expr -rapids-logger "pytest dask_cudf + dask_expr" +# Run tests in dask_cudf/tests and dask_cudf/io/tests (legacy) +rapids-logger "pytest dask_cudf (legacy)" pushd python/dask_cudf/dask_cudf -DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-expr.xml" \ +DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ --numprocesses=8 \ . popd diff --git a/ci/wheel_smoke_test_cudf.py b/ci/wheel_smoke_test_cudf.py deleted file mode 100644 index a11a97039af..00000000000 --- a/ci/wheel_smoke_test_cudf.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. - -import cudf -import pyarrow as pa - -if __name__ == '__main__': - n_legs = pa.array([2, 4, 5, 100]) - animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) - names = ["n_legs", "animals"] - foo = pa.table([n_legs, animals], names=names) - df = cudf.DataFrame.from_arrow(foo) - assert df.loc[df["animals"] == "Centipede"]["n_legs"].iloc[0] == 100 - assert df.loc[df["animals"] == "Flamingo"]["n_legs"].iloc[0] == 2 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index eb4eca1cb12..804b09bab59 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -8,7 +8,6 @@ channels: - nvidia dependencies: - aiobotocore>=2.2.0 -- benchmark==1.8.0 - boto3>=1.21.21 - botocore>=1.24.21 - breathe>=4.35.0 @@ -27,29 +26,27 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.4.* +- dask-cuda==24.6.* - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 - fmt>=10.1.1,<11 - fsspec>=0.6.0 - gcc_linux-64=11.* -- gmock>=1.13.0 -- gtest>=1.13.0 - hypothesis - identify>=2.5.20 - ipython -- libarrow-acero==14.0.2.* -- libarrow-dataset==14.0.2.* -- libarrow==14.0.2.* +- 
libarrow-acero==16.1.0.* +- libarrow-dataset==16.1.0.* +- libarrow==16.1.0.* - libcufile-dev=1.4.0.31 - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 -- libkvikio==24.4.* -- libparquet==14.0.2.* +- libkvikio==24.6.* +- libparquet==16.1.0.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==24.4.* +- librmm==24.6.* - make - moto>=4.0.8 - msgpack-python @@ -64,13 +61,12 @@ dependencies: - nvcomp==3.0.6 - nvtx>=0.2.1 - packaging -- pandas>=2.0,<2.2.2dev0 +- pandas>=2.0,<2.2.3dev0 - pandoc - pip - pre-commit -- protobuf>=3.20,<5 - ptxcompiler -- pyarrow==14.0.2.* +- pyarrow==16.1.0.* - pydata-sphinx-theme!=0.14.2 - pytest-benchmark - pytest-cases>=3.8.2 @@ -80,9 +76,9 @@ dependencies: - python-confluent-kafka>=1.9.0,<1.10.0a0 - python>=3.9,<3.12 - pytorch>=2.1.0 -- rapids-dask-dependency==24.4.* +- rapids-dask-dependency==24.6.* - rich -- rmm==24.4.* +- rmm==24.6.* - s3fs>=2022.3.0 - scikit-build-core>=0.7.0 - scipy @@ -96,7 +92,7 @@ dependencies: - streamz - sysroot_linux-64==2.17 - tokenizers==0.15.2 -- transformers==4.38.1 +- transformers==4.39.3 - typing_extensions>=4.0.0 - zlib>=1.2.13 - pip: diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index b1b41f41803..89eac98f652 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -8,7 +8,6 @@ channels: - nvidia dependencies: - aiobotocore>=2.2.0 -- benchmark==1.8.0 - boto3>=1.21.21 - botocore>=1.24.21 - breathe>=4.35.0 @@ -28,27 +27,25 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.4.* +- dask-cuda==24.6.* - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 - fmt>=10.1.1,<11 - fsspec>=0.6.0 - gcc_linux-64=11.* -- gmock>=1.13.0 -- gtest>=1.13.0 - hypothesis - identify>=2.5.20 - ipython -- libarrow-acero==14.0.2.* -- libarrow-dataset==14.0.2.* -- libarrow==14.0.2.* +- libarrow-acero==16.1.0.* +- libarrow-dataset==16.1.0.* +- libarrow==16.1.0.* - 
libcufile-dev - libcurand-dev -- libkvikio==24.4.* -- libparquet==14.0.2.* +- libkvikio==24.6.* +- libparquet==16.1.0.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==24.4.* +- librmm==24.6.* - make - moto>=4.0.8 - msgpack-python @@ -62,12 +59,11 @@ dependencies: - nvcomp==3.0.6 - nvtx>=0.2.1 - packaging -- pandas>=2.0,<2.2.2dev0 +- pandas>=2.0,<2.2.3dev0 - pandoc - pip - pre-commit -- protobuf>=3.20,<5 -- pyarrow==14.0.2.* +- pyarrow==16.1.0.* - pydata-sphinx-theme!=0.14.2 - pynvjitlink - pytest-benchmark @@ -78,9 +74,9 @@ dependencies: - python-confluent-kafka>=1.9.0,<1.10.0a0 - python>=3.9,<3.12 - pytorch>=2.1.0 -- rapids-dask-dependency==24.4.* +- rapids-dask-dependency==24.6.* - rich -- rmm==24.4.* +- rmm==24.6.* - s3fs>=2022.3.0 - scikit-build-core>=0.7.0 - scipy @@ -94,7 +90,7 @@ dependencies: - streamz - sysroot_linux-64==2.17 - tokenizers==0.15.2 -- transformers==4.38.1 +- transformers==4.39.3 - typing_extensions>=4.0.0 - zlib>=1.2.13 - pip: diff --git a/conda/recipes/cudf/conda_build_config.yaml b/conda/recipes/cudf/conda_build_config.yaml index c98c2701653..d399e440edd 100644 --- a/conda/recipes/cudf/conda_build_config.yaml +++ b/conda/recipes/cudf/conda_build_config.yaml @@ -4,7 +4,10 @@ c_compiler_version: cxx_compiler_version: - 11 -sysroot_version: +c_stdlib: + - sysroot + +c_stdlib_version: - "2.17" cmake_version: diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index cd9237bd7cb..e7245e67659 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -57,16 +57,14 @@ requirements: - {{ compiler('cuda') }} {% endif %} - cuda-version ={{ cuda_version }} - - sysroot_{{ target_platform }} {{ sysroot_version }} + - {{ stdlib("c") }} host: - - protobuf ==4.24.* - python - cython >=3.0.3 - scikit-build-core >=0.7.0 - - setuptools - dlpack >=0.8,<1.0 - numpy 1.23 - - pyarrow ==14.0.2.* + - pyarrow ==16.1.0.* - libcudf ={{ version }} - rmm ={{ minor_version }} {% if cuda_major == "11" %} @@ -78,14 +76,13 @@ 
requirements: {% endif %} - cuda-version ={{ cuda_version }} run: - - protobuf >=3.20,<5.0a0 - python - typing_extensions >=4.0.0 - - pandas >=2.0,<2.2.2dev0 + - pandas >=2.0,<2.2.3dev0 - cupy >=12.0.0 - numba >=0.57 - {{ pin_compatible('numpy', max_pin='x') }} - - {{ pin_compatible('pyarrow', max_pin='x') }} + - {{ pin_compatible('pyarrow', max_pin='x.x') }} - libcudf ={{ version }} - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml index c98c2701653..d399e440edd 100644 --- a/conda/recipes/cudf_kafka/conda_build_config.yaml +++ b/conda/recipes/cudf_kafka/conda_build_config.yaml @@ -4,7 +4,10 @@ c_compiler_version: cxx_compiler_version: - 11 -sysroot_version: +c_stdlib: + - sysroot + +c_stdlib_version: - "2.17" cmake_version: diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index 45e41bf8de7..4d91cf6320c 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -53,7 +53,7 @@ requirements: - {{ compiler('cuda') }} {% endif %} - cuda-version ={{ cuda_version }} - - sysroot_{{ target_platform }} {{ sysroot_version }} + - {{ stdlib("c") }} host: - python - cython >=3.0.3 @@ -61,7 +61,6 @@ requirements: - cudf ={{ version }} - libcudf_kafka ={{ version }} - scikit-build-core >=0.7.0 - - setuptools {% if cuda_major != "11" %} - cuda-cudart-dev {% endif %} diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 53770956ebe..c01178bf732 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -10,20 +10,17 @@ cuda_compiler: cuda11_compiler: - nvcc -sysroot_version: +c_stdlib: + - sysroot + +c_stdlib_version: - "2.17" cmake_version: - ">=3.26.4" -gbench_version: - - "==1.8.0" - -gtest_version: - - ">=1.13.0" - libarrow_version: - - "==14.0.2" + - "==16.1.0" 
dlpack_version: - ">=0.8,<1.0" diff --git a/conda/recipes/libcudf/install_libcudf_example.sh b/conda/recipes/libcudf/install_libcudf_example.sh index e249688a03b..1a52dec99e3 100644 --- a/conda/recipes/libcudf/install_libcudf_example.sh +++ b/conda/recipes/libcudf/install_libcudf_example.sh @@ -1,4 +1,5 @@ #!/bin/bash -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. -./cpp/examples/build.sh +# build and install libcudf examples +./cpp/examples/build.sh --install diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 63eb83084dd..76115362b6c 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -43,7 +43,7 @@ requirements: {% endif %} - cuda-version ={{ cuda_version }} - ninja - - sysroot_{{ target_platform }} {{ sysroot_version }} + - {{ stdlib("c") }} host: - librmm ={{ minor_version }} - libkvikio ={{ minor_version }} @@ -69,9 +69,6 @@ requirements: - librdkafka {{ librdkafka_version }} - fmt {{ fmt_version }} - spdlog {{ spdlog_version }} - - benchmark {{ gbench_version }} - - gtest {{ gtest_version }} - - gmock {{ gtest_version }} - zlib {{ zlib_version }} outputs: @@ -108,8 +105,6 @@ outputs: - librmm ={{ minor_version }} - libkvikio ={{ minor_version }} - dlpack {{ dlpack_version }} - - gtest {{ gtest_version }} - - gmock {{ gtest_version }} test: commands: - test -f $PREFIX/lib/libcudf.so @@ -175,7 +170,7 @@ outputs: {% endif %} - cuda-version ={{ cuda_version }} - ninja - - sysroot_{{ target_platform }} {{ sysroot_version }} + - {{ stdlib("c") }} host: - {{ pin_subpackage('libcudf', exact=True) }} {% if cuda_major == "11" %} @@ -195,7 +190,7 @@ outputs: license: Apache-2.0 license_family: APACHE license_file: LICENSE - summary: libcudf_example library + summary: libcudf example executables - name: libcudf-tests version: {{ version }} script: install_libcudf_tests.sh @@ -221,9 +216,6 @@ outputs: {% else %} - libcurand-dev {% endif %} - - benchmark {{ 
gbench_version }} - - gtest {{ gtest_version }} - - gmock {{ gtest_version }} run: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - {{ pin_subpackage('libcudf', exact=True) }} @@ -233,9 +225,6 @@ outputs: {% else %} - libcurand {% endif %} - - benchmark {{ gbench_version }} - - gtest {{ gtest_version }} - - gmock {{ gtest_version }} about: home: https://rapids.ai/ license: Apache-2.0 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 12837c69e59..1eab51c8827 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -192,6 +192,8 @@ include(cmake/thirdparty/get_cccl.cmake) include(cmake/thirdparty/get_rmm.cmake) # find arrow include(cmake/thirdparty/get_arrow.cmake) +# find flatbuffers +include(cmake/thirdparty/get_flatbuffers.cmake) # find dlpack include(cmake/thirdparty/get_dlpack.cmake) # find cuCollections, should come after including CCCL @@ -210,12 +212,14 @@ include(cmake/thirdparty/get_kvikio.cmake) include(cmake/thirdparty/get_fmt.cmake) # find spdlog include(cmake/thirdparty/get_spdlog.cmake) +# find nanoarrow +include(cmake/thirdparty/get_nanoarrow.cmake) # Workaround until https://github.com/rapidsai/rapids-cmake/issues/176 is resolved if(NOT BUILD_SHARED_LIBS) include("${rapids-cmake-dir}/export/find_package_file.cmake") list(APPEND METADATA_KINDS BUILD INSTALL) - list(APPEND dependencies KvikIO ZLIB nvcomp) + list(APPEND dependencies KvikIO ZLIB nvcomp nanoarrow) if(TARGET cufile::cuFile_interface) list(APPEND dependencies cuFile) endif() @@ -260,6 +264,7 @@ add_library( src/binaryop/compiled/Mod.cu src/binaryop/compiled/Mul.cu src/binaryop/compiled/NullEquals.cu + src/binaryop/compiled/NullNotEquals.cu src/binaryop/compiled/NullLogicalAnd.cu src/binaryop/compiled/NullLogicalOr.cu src/binaryop/compiled/NullMax.cu @@ -344,7 +349,6 @@ add_library( src/groupby/sort/group_replace_nulls.cu src/groupby/sort/group_sum_scan.cu src/groupby/sort/sort_helper.cu - src/hash/hashing.cu src/hash/md5_hash.cu 
src/hash/murmurhash3_x86_32.cu src/hash/murmurhash3_x64_128.cu @@ -353,11 +357,14 @@ add_library( src/hash/sha256_hash.cu src/hash/sha384_hash.cu src/hash/sha512_hash.cu - src/hash/spark_murmurhash3_x86_32.cu src/hash/xxhash_64.cu src/interop/dlpack.cpp src/interop/from_arrow.cu src/interop/to_arrow.cu + src/interop/to_arrow_device.cu + src/interop/from_arrow_device.cu + src/interop/to_arrow_schema.cpp + src/interop/to_arrow_utilities.cpp src/interop/detail/arrow_allocator.cpp src/io/avro/avro.cpp src/io/avro/avro_gpu.cu @@ -391,8 +398,9 @@ add_library( src/io/orc/dict_enc.cu src/io/orc/orc.cpp src/io/orc/reader_impl.cu + src/io/orc/reader_impl_chunking.cu + src/io/orc/reader_impl_decode.cu src/io/orc/reader_impl_helpers.cpp - src/io/orc/reader_impl_preprocess.cu src/io/orc/stats_enc.cu src/io/orc/stripe_data.cu src/io/orc/stripe_enc.cu @@ -424,7 +432,9 @@ add_library( src/io/text/bgzip_utils.cpp src/io/text/multibyte_split.cu src/io/utilities/arrow_io_source.cpp + src/io/utilities/base64_utilities.cpp src/io/utilities/column_buffer.cpp + src/io/utilities/column_buffer_strings.cu src/io/utilities/config_utils.cpp src/io/utilities/data_casting.cu src/io/utilities/data_sink.cpp @@ -450,7 +460,6 @@ add_library( src/join/mixed_join_semi.cu src/join/mixed_join_size_kernel.cu src/join/mixed_join_size_kernel_nulls.cu - src/join/mixed_join_size_kernels_semi.cu src/join/semi_join.cu src/json/json_path.cu src/lists/contains.cu @@ -584,12 +593,14 @@ add_library( src/strings/filling/fill.cu src/strings/filter_chars.cu src/strings/like.cu + src/strings/merge/merge.cu src/strings/padding.cu src/strings/regex/regcomp.cpp src/strings/regex/regexec.cpp src/strings/regex/regex_program.cpp src/strings/repeat_strings.cu src/strings/replace/backref_re.cu + src/strings/replace/find_replace.cu src/strings/replace/multi.cu src/strings/replace/multi_re.cu src/strings/replace/replace.cu @@ -734,6 +745,8 @@ target_include_directories( "$" "$" PRIVATE "$" + "$" + "$" INTERFACE "$" ) @@ -781,8 
+794,8 @@ add_dependencies(cudf jitify_preprocess_run) target_link_libraries( cudf PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm - PRIVATE $ cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio - $ + PRIVATE $ cuco::cuco ZLIB::ZLIB nvcomp::nvcomp + kvikio::kvikio $ nanoarrow ) # Add Conda library, and include paths if specified @@ -842,14 +855,12 @@ if(CUDF_BUILD_TESTUTIL) add_library(cudf::cudftest_default_stream ALIAS cudftest_default_stream) - # Needs to be static so that we support usage of static builds of gtest which doesn't compile with - # fPIC enabled and therefore can't be embedded into shared libraries. add_library( - cudftestutil STATIC + cudftestutil SHARED tests/io/metadata_utilities.cpp - tests/utilities/base_fixture.cpp tests/utilities/column_utilities.cu tests/utilities/debug_utilities.cu + tests/utilities/random_seed.cpp tests/utilities/table_utilities.cu tests/utilities/tdigest_utilities.cu ) @@ -874,8 +885,8 @@ if(CUDF_BUILD_TESTUTIL) target_link_libraries( cudftestutil - PUBLIC GTest::gmock GTest::gtest Threads::Threads cudf cudftest_default_stream - PRIVATE $ + PUBLIC Threads::Threads cudf cudftest_default_stream + PRIVATE GTest::gmock GTest::gtest $ ) target_include_directories( @@ -954,7 +965,7 @@ endif() if(CUDF_BUILD_BENCHMARKS) # Find or install GoogleBench include(${rapids-cmake-dir}/cpm/gbench.cmake) - rapids_cpm_gbench() + rapids_cpm_gbench(BUILD_STATIC) # Find or install nvbench include(cmake/thirdparty/get_nvbench.cmake) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index c82e475dece..10f645dfec0 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -25,7 +25,7 @@ target_compile_options( target_link_libraries( cudf_datagen PUBLIC GTest::gmock GTest::gtest benchmark::benchmark nvbench::nvbench Threads::Threads cudf - cudftestutil nvtx3-cpp + cudftestutil nvtx3::nvtx3-cpp PRIVATE $ ) @@ -40,7 +40,7 @@ target_include_directories( # Use an OBJECT library so we only compile these 
helper source files only once add_library( - cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/base_fixture.cpp" + cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp" synchronization/synchronization.cpp io/cuio_common.cpp ) target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $) @@ -208,8 +208,9 @@ ConfigureNVBench( ) # ################################################################################################## -# * reduction benchmark --------------------------------------------------------------------------- +# * replace benchmark --------------------------------------------------------------------------- ConfigureBench(REPLACE_BENCH replace/clamp.cpp replace/nans.cpp) +ConfigureNVBench(REPLACE_NVBENCH replace/nulls.cpp) # ################################################################################################## # * filling benchmark ----------------------------------------------------------------------------- @@ -235,7 +236,9 @@ ConfigureNVBench(HASHING_NVBENCH hashing/hash.cpp) # ################################################################################################## # * merge benchmark ------------------------------------------------------------------------------- ConfigureBench(MERGE_BENCH merge/merge.cpp) -ConfigureNVBench(MERGE_NVBENCH merge/merge_structs.cpp merge/merge_lists.cpp) +ConfigureNVBench( + MERGE_NVBENCH merge/merge_lists.cpp merge/merge_structs.cpp merge/merge_strings.cpp +) # ################################################################################################## # * null_mask benchmark --------------------------------------------------------------------------- @@ -253,6 +256,11 @@ ConfigureNVBench( PARQUET_READER_NVBENCH io/parquet/parquet_reader_input.cpp io/parquet/parquet_reader_options.cpp ) +# ################################################################################################## +# * parquet multithread reader benchmark +# 
---------------------------------------------------------------------- +ConfigureNVBench(PARQUET_MULTITHREAD_READER_NVBENCH io/parquet/parquet_reader_multithread.cpp) + # ################################################################################################## # * orc reader benchmark -------------------------------------------------------------------------- ConfigureNVBench(ORC_READER_NVBENCH io/orc/orc_reader_input.cpp io/orc/orc_reader_options.cpp) @@ -298,7 +306,6 @@ ConfigureBench( string/copy.cu string/factory.cu string/filter.cpp - string/find.cpp string/repeat_strings.cpp string/replace.cpp string/slice.cpp @@ -315,6 +322,7 @@ ConfigureNVBench( string/copy_range.cpp string/count.cpp string/extract.cpp + string/find.cpp string/gather.cpp string/join_strings.cpp string/lengths.cpp @@ -327,9 +335,10 @@ ConfigureNVBench( # ################################################################################################## # * json benchmark ------------------------------------------------------------------- -ConfigureBench(JSON_BENCH json/json.cu) +ConfigureNVBench(JSON_NVBENCH json/json.cu) ConfigureNVBench(FST_NVBENCH io/fst.cu) ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader_input.cpp) +ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp) ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp) # ################################################################################################## @@ -337,6 +346,16 @@ ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp) ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp) target_link_libraries(MULTIBYTE_SPLIT_NVBENCH PRIVATE ZLIB::ZLIB) +# ################################################################################################## +# * decimal benchmark +# --------------------------------------------------------------------------------- +ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp) + +# 
################################################################################################## +# * reshape benchmark +# --------------------------------------------------------------------------------- +ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp) + add_custom_target( run_benchmarks DEPENDS CUDF_BENCHMARKS diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp index a1131df4472..7086a61c7c5 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -111,5 +111,6 @@ BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NOT_EQUAL, bool BINARYOP_BENCHMARK_DEFINE(timestamp_s, timestamp_s, LESS, bool); BINARYOP_BENCHMARK_DEFINE(timestamp_ms, timestamp_s, GREATER, bool); BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, NULL_EQUALS, bool); +BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, NULL_NOT_EQUALS, bool); BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NULL_MAX, decimal32); BINARYOP_BENCHMARK_DEFINE(timestamp_D, timestamp_s, NULL_MIN, timestamp_s); diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index 9857aac4473..6df2cb44adc 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -324,10 +324,11 @@ struct random_value_fn()>> { distribution_fn dist; std::optional scale; - random_value_fn(distribution_params const& desc) + random_value_fn(distribution_params const& desc) : lower_bound{desc.lower_bound}, upper_bound{desc.upper_bound}, - dist{make_distribution(desc.id, desc.lower_bound, desc.upper_bound)} + dist{make_distribution(desc.id, lower_bound, upper_bound)}, + scale{desc.scale} { } diff --git 
a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp index 31dc2673d70..68d3dc492f5 100644 --- a/cpp/benchmarks/common/generate_input.hpp +++ b/cpp/benchmarks/common/generate_input.hpp @@ -182,9 +182,17 @@ struct distribution_params -struct distribution_params()>> {}; +struct distribution_params()>> { + distribution_id id; + typename T::rep lower_bound; + typename T::rep upper_bound; + std::optional scale; +}; /** * @brief Returns a vector of types, corresponding to the input type or a type group. @@ -226,7 +234,7 @@ class data_profile { cudf::type_id::INT32, {distribution_id::GEOMETRIC, 0, 64}, 2}; distribution_params struct_dist_desc{ {cudf::type_id::INT32, cudf::type_id::FLOAT32, cudf::type_id::STRING}, 2}; - std::map> decimal_params; + std::map> decimal_params; double bool_probability_true = 0.5; std::optional null_probability = 0.01; @@ -300,16 +308,21 @@ class data_profile { } template ()>* = nullptr> - distribution_params get_distribution_params() const + distribution_params get_distribution_params() const { using rep = typename T::rep; auto it = decimal_params.find(cudf::type_to_id()); if (it == decimal_params.end()) { auto const range = default_range(); - return distribution_params{default_distribution_id(), range.first, range.second}; + auto const scale = std::optional{}; + return distribution_params{ + default_distribution_id(), range.first, range.second, scale}; } else { auto& desc = it->second; - return {desc.id, static_cast(desc.lower_bound), static_cast(desc.upper_bound)}; + return {desc.id, + static_cast(desc.lower_bound), + static_cast(desc.upper_bound), + desc.scale}; } } @@ -359,6 +372,23 @@ class data_profile { } } + // Users should pass integral values for bounds when setting the parameters for fixed-point. + // Otherwise the call with have no effect. 
+ template , T>* = nullptr> + void set_distribution_params(Type_enum type_or_group, + distribution_id dist, + T lower_bound, + T upper_bound, + numeric::scale_type scale) + { + for (auto tid : get_type_or_group(static_cast(type_or_group))) { + decimal_params[tid] = { + dist, static_cast<__int128_t>(lower_bound), static_cast<__int128_t>(upper_bound), scale}; + } + } + template (), T>* = nullptr> void set_distribution_params(Type_enum type_or_group, distribution_id dist, diff --git a/cpp/benchmarks/copying/shift.cu b/cpp/benchmarks/copying/shift.cu index e1169e3bcd6..efc385cf10b 100644 --- a/cpp/benchmarks/copying/shift.cu +++ b/cpp/benchmarks/copying/shift.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,11 +21,13 @@ #include #include +#include + template > std::unique_ptr make_scalar( - T value = 0, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + T value = 0, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) { auto s = new ScalarType(value, true, stream, mr); return std::unique_ptr(s); diff --git a/cpp/benchmarks/decimal/convert_floating.cpp b/cpp/benchmarks/decimal/convert_floating.cpp new file mode 100644 index 00000000000..a367036c494 --- /dev/null +++ b/cpp/benchmarks/decimal/convert_floating.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include + +#include + +// This benchmark compares the cost of converting decimal <--> floating point +template +void bench_cast_decimal(nvbench::state& state, nvbench::type_list) +{ + static constexpr bool is_input_floating = std::is_floating_point_v; + static constexpr bool is_output_floating = std::is_floating_point_v; + + static constexpr bool is_double = + std::is_same_v || std::is_same_v; + static constexpr bool is_32bit = + std::is_same_v || std::is_same_v; + static constexpr bool is_128bit = std::is_same_v || + std::is_same_v; + + // Skip floating --> floating and decimal --> decimal + if constexpr (is_input_floating == is_output_floating) { + state.skip("Meaningless conversion."); + return; + } + + // Skip float <--> dec128 + if constexpr (!is_double && is_128bit) { + state.skip("Ignoring float <--> dec128."); + return; + } + + // Get settings + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const exp_mode = state.get_int64("exp_range"); + + // Exponent range: Range size is 10^6 + // These probe the edges of the float and double ranges, as well as more common values + int const exp_min_array[] = {-307, -37, -14, -3, 8, 31, 301}; + int const exp_range_size = 6; + int const exp_min = exp_min_array[exp_mode]; + int const exp_max = exp_min + exp_range_size; + + // With exp range size of 6, decimal output (generated or casted-to) has 7 digits of precision + int const extra_digits_precision = 1; + + // Exclude end range of double from float test + if (!is_double && ((exp_mode == 
0) || (exp_mode == 6))) { + state.skip("Range beyond end of float tests."); + return; + } + + // The current float <--> decimal conversion algorithm is limited + static constexpr bool is_64bit = !is_32bit && !is_128bit; + if (is_32bit && (exp_mode != 3)) { + state.skip("Decimal32 conversion only works up to scale factors of 10^9."); + return; + } + if (is_64bit && ((exp_mode < 2) || (exp_mode > 4))) { + state.skip("Decimal64 conversion only works up to scale factors of 10^18."); + return; + } + if (is_128bit && ((exp_mode == 0) || (exp_mode == 6))) { + state.skip("Decimal128 conversion only works up to scale factors of 10^38."); + return; + } + + // Type IDs + auto const input_id = cudf::type_to_id(); + auto const output_id = cudf::type_to_id(); + + // Create data profile and scale + auto const [output_scale, profile] = [&]() { + if constexpr (is_input_floating) { + // Range for generated floating point values + auto get_pow10 = [](auto exp10) { + return std::pow(static_cast(10), static_cast(exp10)); + }; + InputType const floating_range_min = get_pow10(exp_min); + InputType const floating_range_max = get_pow10(exp_max); + + // With exp range size of 6, output has 7 decimal digits of precision + auto const decimal_output_scale = exp_min - extra_digits_precision; + + // Input distribution + data_profile const profile = data_profile_builder().distribution( + input_id, distribution_id::NORMAL, floating_range_min, floating_range_max); + + return std::pair{decimal_output_scale, profile}; + + } else { // Generating decimals + + using decimal_rep_type = typename InputType::rep; + + // For exp range size 6 and precision 7, generates ints between 10 and 10^7, + // with scale factor of: exp_max - 7. This matches floating point generation. 
+ int const digits_precision = exp_range_size + extra_digits_precision; + auto const decimal_input_scale = numeric::scale_type{exp_max - digits_precision}; + + // Range for generated integer values + auto get_pow10 = [](auto exp10) { + return numeric::detail::ipow(exp10); + }; + auto const decimal_range_min = get_pow10(digits_precision - exp_range_size); + auto const decimal_range_max = get_pow10(digits_precision); + + // Input distribution + data_profile const profile = data_profile_builder().distribution(input_id, + distribution_id::NORMAL, + decimal_range_min, + decimal_range_max, + decimal_input_scale); + + return std::pair{0, profile}; + } + }(); + + // Generate input data + auto const input_col = create_random_column(input_id, row_count{num_rows}, profile); + auto const input_view = input_col->view(); + + // Output type + auto const output_type = + !is_input_floating ? cudf::data_type(output_id) : cudf::data_type(output_id, output_scale); + + // Stream + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + // Run benchmark + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::cast(input_view, output_type); }); + + // Throughput statistics + state.add_element_count(num_rows); + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(num_rows); +} + +// Data types +using data_types = + nvbench::type_list; + +NVBENCH_BENCH_TYPES(bench_cast_decimal, NVBENCH_TYPE_AXES(data_types, data_types)) + .set_name("decimal_floating_conversion") + .set_type_axes_names({"InputType", "OutputType"}) + .add_int64_power_of_two_axis("num_rows", {28}) + .add_int64_axis("exp_range", nvbench::range(0, 6)); diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp index adde0ae1720..8c8d6756b00 100644 --- a/cpp/benchmarks/fixture/benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp @@ -120,6 +120,7 @@ class 
memory_stats_logger { } private: + // TODO change to resource_ref once set_current_device_resource supports it rmm::mr::device_memory_resource* existing_mr; rmm::mr::statistics_resource_adaptor statistics_mr; }; diff --git a/cpp/benchmarks/fixture/nvbench_fixture.hpp b/cpp/benchmarks/fixture/nvbench_fixture.hpp index 4e4eec3547f..ac0cab4071b 100644 --- a/cpp/benchmarks/fixture/nvbench_fixture.hpp +++ b/cpp/benchmarks/fixture/nvbench_fixture.hpp @@ -45,6 +45,8 @@ static std::string cuio_host_mem_param{ * Initializes the default memory resource to use the RMM pool device resource. */ struct nvbench_base_fixture { + using host_pooled_mr_t = rmm::mr::pool_memory_resource; + inline auto make_cuda() { return std::make_shared(); } inline auto make_pool() @@ -90,12 +92,14 @@ struct nvbench_base_fixture { inline rmm::host_async_resource_ref make_cuio_host_pinned_pool() { - using host_pooled_mr = rmm::mr::pool_memory_resource; - static std::shared_ptr mr = std::make_shared( - std::make_shared().get(), - size_t{1} * 1024 * 1024 * 1024); + if (!this->host_pooled_mr) { + // Don't store in static, as the CUDA context may be destroyed before static destruction + this->host_pooled_mr = std::make_shared( + std::make_shared().get(), + size_t{1} * 1024 * 1024 * 1024); + } - return *mr; + return *this->host_pooled_mr; } inline rmm::host_async_resource_ref create_cuio_host_memory_resource(std::string const& mode) @@ -126,9 +130,16 @@ struct nvbench_base_fixture { std::cout << "CUIO host memory resource = " << cuio_host_mode << "\n"; } + ~nvbench_base_fixture() + { + // Ensure the the pool is freed before the CUDA context is destroyed: + cudf::io::set_host_memory_resource(this->make_cuio_host_pinned()); + } + std::shared_ptr mr; std::string rmm_mode{"pool"}; + std::shared_ptr host_pooled_mr; std::string cuio_host_mode{"pinned"}; }; diff --git a/cpp/benchmarks/fixture/nvbench_main.cpp b/cpp/benchmarks/fixture/nvbench_main.cpp index f46cb11a6c3..5dfd67b1c54 100644 --- 
a/cpp/benchmarks/fixture/nvbench_main.cpp +++ b/cpp/benchmarks/fixture/nvbench_main.cpp @@ -15,29 +15,44 @@ */ #include -#define NVBENCH_ENVIRONMENT cudf::nvbench_base_fixture #include +#include #include +namespace cudf { + // strip off the rmm_mode and cuio_host_mem parameters before passing the // remaining arguments to nvbench::option_parser -#undef NVBENCH_MAIN_PARSE -#define NVBENCH_MAIN_PARSE(argc, argv) \ - nvbench::option_parser parser; \ - std::vector m_args; \ - for (int i = 0; i < argc; ++i) { \ - std::string arg = argv[i]; \ - if (arg == cudf::detail::rmm_mode_param) { \ - i += 2; \ - } else if (arg == cudf::detail::cuio_host_mem_param) { \ - i += 2; \ - } else { \ - m_args.push_back(arg); \ - } \ - } \ - parser.parse(m_args) +void benchmark_arg_handler(std::vector& args) +{ + std::vector _cudf_tmp_args; + + for (std::size_t i = 0; i < args.size(); ++i) { + std::string arg = args[i]; + if (arg == cudf::detail::rmm_mode_param) { + i++; // skip the next argument + } else if (arg == cudf::detail::cuio_host_mem_param) { + i++; // skip the next argument + } else { + _cudf_tmp_args.push_back(arg); + } + } + + args = _cudf_tmp_args; +} + +} // namespace cudf + +// Install arg handler +#undef NVBENCH_MAIN_CUSTOM_ARGS_HANDLER +#define NVBENCH_MAIN_CUSTOM_ARGS_HANDLER(args) cudf::benchmark_arg_handler(args) + +// Global fixture setup: +#undef NVBENCH_MAIN_INITIALIZE_CUSTOM_POST +#define NVBENCH_MAIN_INITIALIZE_CUSTOM_POST(argc, argv) \ + [[maybe_unused]] auto env_state = cudf::nvbench_base_fixture(argc, argv); // this declares/defines the main() function using the definitions above NVBENCH_MAIN diff --git a/cpp/benchmarks/io/cuio_common.hpp b/cpp/benchmarks/io/cuio_common.hpp index 3d5be41e25f..6e0b32219ce 100644 --- a/cpp/benchmarks/io/cuio_common.hpp +++ b/cpp/benchmarks/io/cuio_common.hpp @@ -39,6 +39,10 @@ class cuio_source_sink_pair { // delete the temporary file std::remove(file_name.c_str()); } + // move constructor + 
cuio_source_sink_pair(cuio_source_sink_pair&& ss) = default; + cuio_source_sink_pair& operator=(cuio_source_sink_pair&& ss) = default; + /** * @brief Created a source info of the set type * diff --git a/cpp/benchmarks/io/json/json_reader_option.cpp b/cpp/benchmarks/io/json/json_reader_option.cpp new file mode 100644 index 00000000000..378134a2010 --- /dev/null +++ b/cpp/benchmarks/io/json/json_reader_option.cpp @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include + +#include + +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to +// run on most GPUs, but large enough to allow highest throughput +constexpr size_t data_size = 512 << 20; +constexpr cudf::size_type num_cols = 64; + +template +void BM_json_read_options(nvbench::state& state, nvbench::type_list>) +{ + constexpr auto json_lines_bool = JsonLines == json_lines::YES; + + cuio_source_sink_pair source_sink(io_type::HOST_BUFFER); + auto const data_types = get_type_or_group({static_cast(data_type::INTEGRAL), + static_cast(data_type::FLOAT), + static_cast(data_type::DECIMAL), + static_cast(data_type::STRING), + static_cast(data_type::LIST), + static_cast(data_type::STRUCT)}); + + auto const tbl = create_random_table( + cycle_dtypes(data_types, num_cols), table_size_bytes{data_size}, data_profile_builder()); + auto const view = tbl->view(); + cudf::io::json_writer_options const write_opts = + cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view) + .lines(json_lines_bool) + .na_rep("null") + .rows_per_chunk(100'000); + cudf::io::write_json(write_opts); + + cudf::io::json_reader_options read_options = + cudf::io::json_reader_options::builder(source_sink.make_source_info()).lines(json_lines_bool); + + auto mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + try_drop_l3_cache(); + timer.start(); + auto const result = cudf::io::read_json(read_options); + auto const num_rows_read = result.tbl->num_rows(); + auto const num_cols_read = result.tbl->num_columns(); + timer.stop(); + CUDF_EXPECTS(num_rows_read == view.num_rows(), "Benchmark did not read the entire table"); + CUDF_EXPECTS(num_cols_read == num_cols, "Unexpected number of 
columns"); + }); + + auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + auto const data_processed = data_size * num_cols / view.num_columns(); + state.add_element_count(static_cast(data_processed) / elapsed_time, "bytes_per_second"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); + state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); +} + +template +void BM_jsonlines_read_options(nvbench::state& state, + nvbench::type_list, + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type>) +{ + constexpr auto normalize_single_quotes_bool = + NormalizeSingleQuotes == normalize_single_quotes::YES; + constexpr auto normalize_whitespace_bool = NormalizeWhitespace == normalize_whitespace::YES; + constexpr auto mixed_types_as_string_bool = MixedTypesAsString == mixed_types_as_string::YES; + constexpr auto recovery_mode_enum = RecoveryMode == recovery_mode::RECOVER_WITH_NULL + ? 
cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL + : cudf::io::json_recovery_mode_t::FAIL; + size_t const num_chunks = state.get_int64("num_chunks"); + if (num_chunks > 1 && RowSelection == row_selection::ALL) { + state.skip( + "No point running the same benchmark multiple times for different num_chunks when all rows " + "are being selected anyway"); + return; + } + + cuio_source_sink_pair source_sink(io_type::HOST_BUFFER); + auto const data_types = get_type_or_group({static_cast(data_type::INTEGRAL), + static_cast(data_type::FLOAT), + static_cast(data_type::DECIMAL), + static_cast(data_type::STRING), + static_cast(data_type::LIST), + static_cast(data_type::STRUCT)}); + + auto const tbl = create_random_table( + cycle_dtypes(data_types, num_cols), table_size_bytes{data_size}, data_profile_builder()); + auto const view = tbl->view(); + cudf::io::json_writer_options const write_opts = + cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view) + .lines(true) + .na_rep("null") + .rows_per_chunk(100'000); + cudf::io::write_json(write_opts); + + cudf::io::json_reader_options read_options = + cudf::io::json_reader_options::builder(source_sink.make_source_info()) + .lines(true) + .normalize_single_quotes(normalize_single_quotes_bool) + .normalize_whitespace(normalize_whitespace_bool) + .mixed_types_as_string(mixed_types_as_string_bool) + .recovery_mode(recovery_mode_enum); + + size_t const chunk_size = cudf::util::div_rounding_up_safe(source_sink.size(), num_chunks); + auto mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + try_drop_l3_cache(); + cudf::size_type num_rows_read = 0; + cudf::size_type num_cols_read = 0; + timer.start(); + switch (RowSelection) { + case row_selection::ALL: { + auto const result = cudf::io::read_json(read_options); + 
num_rows_read = result.tbl->num_rows(); + num_cols_read = result.tbl->num_columns(); + break; + } + case row_selection::BYTE_RANGE: { + for (uint64_t chunk = 0; chunk < num_chunks; chunk++) { + read_options.set_byte_range_offset(chunk * chunk_size); + read_options.set_byte_range_size(chunk_size); + auto const result = cudf::io::read_json(read_options); + num_rows_read += result.tbl->num_rows(); + num_cols_read = result.tbl->num_columns(); + if (num_cols_read) + CUDF_EXPECTS(num_cols_read == num_cols, "Unexpected number of columns"); + } + break; + } + default: CUDF_FAIL("Unsupported row selection method"); + } + timer.stop(); + CUDF_EXPECTS(num_rows_read == view.num_rows(), "Benchmark did not read the entire table"); + }); + + auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + auto const data_processed = data_size * num_cols / view.num_columns(); + state.add_element_count(static_cast(data_processed) / elapsed_time, "bytes_per_second"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); + state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); +} + +NVBENCH_BENCH_TYPES(BM_jsonlines_read_options, + NVBENCH_TYPE_AXES(nvbench::enum_type_list, + nvbench::enum_type_list, + nvbench::enum_type_list, + nvbench::enum_type_list, + nvbench::enum_type_list)) + .set_name("jsonlines_reader_normalize_single_quotes") + .set_type_axes_names({"row_selection", + "normalize_single_quotes", + "normalize_whitespace", + "mixed_types_as_string", + "recovery_mode"}) + .set_min_samples(6) + .add_int64_axis("num_chunks", nvbench::range(1, 1, 1)); + +NVBENCH_BENCH_TYPES( + BM_jsonlines_read_options, + NVBENCH_TYPE_AXES(nvbench::enum_type_list, + nvbench::enum_type_list, + nvbench::enum_type_list, + nvbench::enum_type_list, + nvbench::enum_type_list)) + .set_name("jsonlines_reader_normalize_whitespace") + .set_type_axes_names({"row_selection", + 
"normalize_single_quotes", + "normalize_whitespace", + "mixed_types_as_string", + "recovery_mode"}) + .set_min_samples(6) + .add_int64_axis("num_chunks", nvbench::range(1, 1, 1)); + +NVBENCH_BENCH_TYPES( + BM_jsonlines_read_options, + NVBENCH_TYPE_AXES(nvbench::enum_type_list, + nvbench::enum_type_list, + nvbench::enum_type_list, + nvbench::enum_type_list, + nvbench::enum_type_list)) + .set_name("jsonlines_reader_mixed_types_as_string") + .set_type_axes_names({"row_selection", + "normalize_single_quotes", + "normalize_whitespace", + "mixed_types_as_string", + "recovery_mode"}) + .set_min_samples(6) + .add_int64_axis("num_chunks", nvbench::range(1, 1, 1)); + +NVBENCH_BENCH_TYPES( + BM_jsonlines_read_options, + NVBENCH_TYPE_AXES(nvbench::enum_type_list, + nvbench::enum_type_list, + nvbench::enum_type_list, + nvbench::enum_type_list, + nvbench::enum_type_list)) + .set_name("jsonlines_reader_row_selection") + .set_type_axes_names({"row_selection", + "normalize_single_quotes", + "normalize_whitespace", + "mixed_types_as_string", + "recovery_mode"}) + .set_min_samples(6) + .add_int64_axis("num_chunks", nvbench::range(1, 5, 1)); + +NVBENCH_BENCH_TYPES(BM_json_read_options, + NVBENCH_TYPE_AXES(nvbench::enum_type_list)) + .set_name("json_reader") + .set_type_axes_names({"json_lines"}) + .set_min_samples(6); diff --git a/cpp/benchmarks/io/nvbench_helpers.hpp b/cpp/benchmarks/io/nvbench_helpers.hpp index dd96f6fa4cd..8b79912c7ee 100644 --- a/cpp/benchmarks/io/nvbench_helpers.hpp +++ b/cpp/benchmarks/io/nvbench_helpers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -169,3 +169,68 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( } }, [](auto) { return std::string{}; }) + +enum class json_lines : bool { YES, NO }; + +enum class normalize_single_quotes : bool { YES, NO }; + +enum class normalize_whitespace : bool { YES, NO }; + +enum class mixed_types_as_string : bool { YES, NO }; + +enum class recovery_mode : bool { FAIL, RECOVER_WITH_NULL }; + +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + json_lines, + [](auto value) { + switch (value) { + case json_lines::YES: return "YES"; + case json_lines::NO: return "NO"; + default: return "Unknown"; + } + }, + [](auto) { return std::string{}; }) + +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + normalize_single_quotes, + [](auto value) { + switch (value) { + case normalize_single_quotes::YES: return "YES"; + case normalize_single_quotes::NO: return "NO"; + default: return "Unknown"; + } + }, + [](auto) { return std::string{}; }) + +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + normalize_whitespace, + [](auto value) { + switch (value) { + case normalize_whitespace::YES: return "YES"; + case normalize_whitespace::NO: return "NO"; + default: return "Unknown"; + } + }, + [](auto) { return std::string{}; }) + +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + mixed_types_as_string, + [](auto value) { + switch (value) { + case mixed_types_as_string::YES: return "YES"; + case mixed_types_as_string::NO: return "NO"; + default: return "Unknown"; + } + }, + [](auto) { return std::string{}; }) + +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + recovery_mode, + [](auto value) { + switch (value) { + case recovery_mode::FAIL: return "FAIL"; + case recovery_mode::RECOVER_WITH_NULL: return "RECOVER_WITH_NULL"; + default: return "Unknown"; + } + }, + [](auto) { return std::string{}; }) diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp index fdb7dbe59b8..b7c214a8374 100644 --- a/cpp/benchmarks/io/orc/orc_reader_input.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 
2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,31 +24,59 @@ #include +namespace { + // Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput -constexpr int64_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; +constexpr std::size_t data_size = 512 << 20; +constexpr std::size_t Mbytes = 1024 * 1024; +template void orc_read_common(cudf::size_type num_rows_to_read, cuio_source_sink_pair& source_sink, nvbench::state& state) { - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder(source_sink.make_source_info()); + auto const read_opts = + cudf::io::orc_reader_options::builder(source_sink.make_source_info()).build(); auto mem_stats_logger = cudf::memory_stats_logger(); // init stats logger state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.exec( - nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { - try_drop_l3_cache(); - - timer.start(); - auto const result = cudf::io::read_orc(read_opts); - timer.stop(); - CUDF_EXPECTS(result.tbl->num_columns() == num_cols, "Unexpected number of columns"); - CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows"); - }); + if constexpr (is_chunked_read) { + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch&, auto& timer) { + try_drop_l3_cache(); + auto const output_limit_MB = + static_cast(state.get_int64("chunk_read_limit_MB")); + auto const read_limit_MB = static_cast(state.get_int64("pass_read_limit_MB")); + + auto reader = + cudf::io::chunked_orc_reader(output_limit_MB * Mbytes, read_limit_MB * Mbytes, read_opts); + cudf::size_type num_rows{0}; + + 
timer.start(); + do { + auto chunk = reader.read_chunk(); + num_rows += chunk.tbl->num_rows(); + } while (reader.has_next()); + timer.stop(); + + CUDF_EXPECTS(num_rows == num_rows_to_read, "Unexpected number of rows"); + }); + } else { // not is_chunked_read + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch&, auto& timer) { + try_drop_l3_cache(); + + timer.start(); + auto const result = cudf::io::read_orc(read_opts); + timer.stop(); + + CUDF_EXPECTS(result.tbl->num_columns() == num_cols, "Unexpected number of columns"); + CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows"); + }); + } auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); @@ -57,6 +85,8 @@ void orc_read_common(cudf::size_type num_rows_to_read, state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); } +} // namespace + template void BM_orc_read_data(nvbench::state& state, nvbench::type_list, nvbench::enum_type>) @@ -79,13 +109,11 @@ void BM_orc_read_data(nvbench::state& state, return view.num_rows(); }(); - orc_read_common(num_rows_written, source_sink, state); + orc_read_common(num_rows_written, source_sink, state); } -template -void BM_orc_read_io_compression( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +void orc_read_io_compression(nvbench::state& state) { auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL_SIGNED), static_cast(data_type::FLOAT), @@ -95,15 +123,21 @@ void BM_orc_read_io_compression( static_cast(data_type::LIST), static_cast(data_type::STRUCT)}); - cudf::size_type const cardinality = state.get_int64("cardinality"); - cudf::size_type const run_length = state.get_int64("run_length"); + auto const [cardinality, run_length] = [&]() -> std::pair { + if constexpr (chunked_read) { + return {0, 4}; + } else { + return 
{static_cast(state.get_int64("cardinality")), + static_cast(state.get_int64("run_length"))}; + } + }(); cuio_source_sink_pair source_sink(IOType); auto const num_rows_written = [&]() { auto const tbl = create_random_table( cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, - data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + data_profile_builder{}.cardinality(cardinality).avg_run_length(run_length)); auto const view = tbl->view(); cudf::io::orc_writer_options opts = @@ -113,7 +147,23 @@ void BM_orc_read_io_compression( return view.num_rows(); }(); - orc_read_common(num_rows_written, source_sink, state); + orc_read_common(num_rows_written, source_sink, state); +} + +template +void BM_orc_read_io_compression( + nvbench::state& state, + nvbench::type_list, nvbench::enum_type>) +{ + return orc_read_io_compression(state); +} + +template +void BM_orc_chunked_read_io_compression(nvbench::state& state, + nvbench::type_list>) +{ + // Only run benchmark using HOST_BUFFER IO. 
+ return orc_read_io_compression(state); } using d_type_list = nvbench::enum_type_list +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +#include + +// TODO: remove this once pinned/pooled is enabled by default in cuIO +void set_cuio_host_pinned_pool() +{ + using host_pooled_mr = rmm::mr::pool_memory_resource; + static std::shared_ptr mr = std::make_shared( + std::make_shared().get(), 256ul * 1024 * 1024); + cudf::io::set_host_memory_resource(*mr); +} + +size_t get_num_reads(nvbench::state const& state) { return state.get_int64("num_threads"); } + +size_t get_read_size(nvbench::state const& state) +{ + auto const num_reads = get_num_reads(state); + return state.get_int64("total_data_size") / num_reads; +} + +std::string get_label(std::string const& test_name, nvbench::state const& state) +{ + auto const num_cols = state.get_int64("num_cols"); + size_t const read_size_mb = get_read_size(state) / (1024 * 1024); + return {test_name + ", " + std::to_string(num_cols) + " columns, " + + std::to_string(state.get_int64("num_threads")) + " threads " + " (" + + std::to_string(read_size_mb) + " MB each)"}; +} + +std::tuple, size_t, size_t> write_file_data( + nvbench::state& state, std::vector const& d_types) +{ + cudf::size_type const cardinality = state.get_int64("cardinality"); + cudf::size_type const run_length = state.get_int64("run_length"); + cudf::size_type const num_cols = state.get_int64("num_cols"); + size_t const num_files = get_num_reads(state); + size_t const per_file_data_size = get_read_size(state); + + std::vector source_sink_vector; + + size_t total_file_size = 0; + + for (size_t i = 0; i < num_files; ++i) { + cuio_source_sink_pair source_sink{cudf::io::io_type::HOST_BUFFER}; + + auto const tbl = create_random_table( + cycle_dtypes(d_types, num_cols), + table_size_bytes{per_file_data_size}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + 
auto const view = tbl->view(); + + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .compression(cudf::io::compression_type::SNAPPY) + .max_page_size_rows(50000) + .max_page_size_bytes(1024 * 1024); + + cudf::io::write_parquet(write_opts); + total_file_size += source_sink.size(); + + source_sink_vector.push_back(std::move(source_sink)); + } + + return {std::move(source_sink_vector), total_file_size, num_files}; +} + +void BM_parquet_multithreaded_read_common(nvbench::state& state, + std::vector const& d_types, + std::string const& label) +{ + size_t const data_size = state.get_int64("total_data_size"); + auto const num_threads = state.get_int64("num_threads"); + + set_cuio_host_pinned_pool(); + + auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); + cudf::detail::thread_pool threads(num_threads); + + auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + + auto mem_stats_logger = cudf::memory_stats_logger(); + + nvtxRangePushA(("(read) " + label).c_str()); + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + auto read_func = [&](int index) { + auto const stream = streams[index % num_threads]; + auto& source_sink = source_sink_vector[index]; + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); + cudf::io::read_parquet(read_opts, stream, rmm::mr::get_current_device_resource()); + }; + + threads.paused = true; + for (size_t i = 0; i < num_files; ++i) { + threads.submit(read_func, i); + } + timer.start(); + threads.paused = false; + threads.wait_for_tasks(); + cudf::detail::join_streams(streams, cudf::get_default_stream()); + timer.stop(); + }); + nvtxRangePop(); + + auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(data_size) 
/ time, "bytes_per_second"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); + state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size"); +} + +void BM_parquet_multithreaded_read_mixed(nvbench::state& state) +{ + auto label = get_label("mixed", state); + nvtxRangePushA(label.c_str()); + BM_parquet_multithreaded_read_common( + state, {cudf::type_id::INT32, cudf::type_id::DECIMAL64, cudf::type_id::STRING}, label); + nvtxRangePop(); +} + +void BM_parquet_multithreaded_read_fixed_width(nvbench::state& state) +{ + auto label = get_label("fixed width", state); + nvtxRangePushA(label.c_str()); + BM_parquet_multithreaded_read_common(state, {cudf::type_id::INT32}, label); + nvtxRangePop(); +} + +void BM_parquet_multithreaded_read_string(nvbench::state& state) +{ + auto label = get_label("string", state); + nvtxRangePushA(label.c_str()); + BM_parquet_multithreaded_read_common(state, {cudf::type_id::STRING}, label); + nvtxRangePop(); +} + +void BM_parquet_multithreaded_read_list(nvbench::state& state) +{ + auto label = get_label("list", state); + nvtxRangePushA(label.c_str()); + BM_parquet_multithreaded_read_common(state, {cudf::type_id::LIST}, label); + nvtxRangePop(); +} + +void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, + std::vector const& d_types, + std::string const& label) +{ + size_t const data_size = state.get_int64("total_data_size"); + auto const num_threads = state.get_int64("num_threads"); + size_t const input_limit = state.get_int64("input_limit"); + size_t const output_limit = state.get_int64("output_limit"); + + set_cuio_host_pinned_pool(); + + auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); + cudf::detail::thread_pool threads(num_threads); + auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + + auto mem_stats_logger = cudf::memory_stats_logger(); + + nvtxRangePushA(("(read) " + 
label).c_str()); + std::vector chunks; + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + auto read_func = [&](int index) { + auto const stream = streams[index % num_threads]; + auto& source_sink = source_sink_vector[index]; + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); + // divide chunk limits by number of threads so the number of chunks produced is the + // same for all cases. this seems better than the alternative, which is to keep the + // limits the same. if we do that, as the number of threads goes up, the number of + // chunks goes down - so are we actually benchmarking the same thing in that case? + auto reader = cudf::io::chunked_parquet_reader( + output_limit / num_threads, input_limit / num_threads, read_opts, stream); + + // read all the chunks + do { + auto table = reader.read_chunk(); + } while (reader.has_next()); + }; + + threads.paused = true; + for (size_t i = 0; i < num_files; ++i) { + threads.submit(read_func, i); + } + timer.start(); + threads.paused = false; + threads.wait_for_tasks(); + cudf::detail::join_streams(streams, cudf::get_default_stream()); + timer.stop(); + }); + nvtxRangePop(); + + auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); + state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size"); +} + +void BM_parquet_multithreaded_read_chunked_mixed(nvbench::state& state) +{ + auto label = get_label("mixed", state); + nvtxRangePushA(label.c_str()); + BM_parquet_multithreaded_read_chunked_common( + state, {cudf::type_id::INT32, cudf::type_id::DECIMAL64, cudf::type_id::STRING}, label); + nvtxRangePop(); +} + +void 
BM_parquet_multithreaded_read_chunked_fixed_width(nvbench::state& state) +{ + auto label = get_label("mixed", state); + nvtxRangePushA(label.c_str()); + BM_parquet_multithreaded_read_chunked_common(state, {cudf::type_id::INT32}, label); + nvtxRangePop(); +} + +void BM_parquet_multithreaded_read_chunked_string(nvbench::state& state) +{ + auto label = get_label("string", state); + nvtxRangePushA(label.c_str()); + BM_parquet_multithreaded_read_chunked_common(state, {cudf::type_id::STRING}, label); + nvtxRangePop(); +} + +void BM_parquet_multithreaded_read_chunked_list(nvbench::state& state) +{ + auto label = get_label("list", state); + nvtxRangePushA(label.c_str()); + BM_parquet_multithreaded_read_chunked_common(state, {cudf::type_id::LIST}, label); + nvtxRangePop(); +} + +// mixed data types: fixed width and strings +NVBENCH_BENCH(BM_parquet_multithreaded_read_mixed) + .set_name("parquet_multithreaded_read_decode_mixed") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) + .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}); + +NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width) + .set_name("parquet_multithreaded_read_decode_fixed_width") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) + .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}); + +NVBENCH_BENCH(BM_parquet_multithreaded_read_string) + .set_name("parquet_multithreaded_read_decode_string") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) + .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}); + 
+NVBENCH_BENCH(BM_parquet_multithreaded_read_list) + .set_name("parquet_multithreaded_read_decode_list") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) + .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}); + +// mixed data types: fixed width, strings +NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed) + .set_name("parquet_multithreaded_read_decode_chunked_mixed") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) + .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}) + .add_int64_axis("input_limit", {640 * 1024 * 1024}) + .add_int64_axis("output_limit", {640 * 1024 * 1024}); + +NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width) + .set_name("parquet_multithreaded_read_decode_chunked_fixed_width") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) + .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}) + .add_int64_axis("input_limit", {640 * 1024 * 1024}) + .add_int64_axis("output_limit", {640 * 1024 * 1024}); + +NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string) + .set_name("parquet_multithreaded_read_decode_chunked_string") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) + .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}) + .add_int64_axis("input_limit", {640 * 1024 * 1024}) + .add_int64_axis("output_limit", {640 * 1024 * 1024}); + +NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list) + 
.set_name("parquet_multithreaded_read_decode_chunked_list") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) + .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}) + .add_int64_axis("input_limit", {640 * 1024 * 1024}) + .add_int64_axis("output_limit", {640 * 1024 * 1024}); diff --git a/cpp/benchmarks/join/conditional_join.cu b/cpp/benchmarks/join/conditional_join.cu index d721de0e8fd..d95fc0a5b59 100644 --- a/cpp/benchmarks/join/conditional_join.cu +++ b/cpp/benchmarks/join/conditional_join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,126 +16,102 @@ #include -template +template class ConditionalJoin : public cudf::benchmark {}; // For compatibility with the shared logic for equality (hash) joins, all of // the join lambdas defined by these macros accept a null_equality parameter // but ignore it (don't forward it to the underlying join implementation) // because conditional joins do not use this parameter. 
-#define CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - cudf::ast::operation binary_pred, \ - cudf::null_equality compare_nulls) { \ - return cudf::conditional_inner_join(left, right, binary_pred); \ - }; \ - BM_join(st, join); \ +#define CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + cudf::ast::operation binary_pred, \ + cudf::null_equality compare_nulls) { \ + return cudf::conditional_inner_join(left, right, binary_pred); \ + }; \ + BM_join(st, join); \ } -CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_32bit, int32_t, int32_t, false); -CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_64bit, int64_t, int64_t, false); -CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_32bit_nulls, int32_t, int32_t, true); -CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_64bit_nulls, int64_t, int64_t, true); - -#define CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - cudf::ast::operation binary_pred, \ - cudf::null_equality compare_nulls) { \ - return cudf::conditional_left_join(left, right, binary_pred); \ - }; \ - BM_join(st, join); \ +CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_32bit, int32_t, false); +CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_64bit, int64_t, false); 
+CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_32bit_nulls, int32_t, true); +CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_64bit_nulls, int64_t, true); + +#define CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + cudf::ast::operation binary_pred, \ + cudf::null_equality compare_nulls) { \ + return cudf::conditional_left_join(left, right, binary_pred); \ + }; \ + BM_join(st, join); \ } -CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_32bit, int32_t, int32_t, false); -CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit, int64_t, int64_t, false); -CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_32bit_nulls, int32_t, int32_t, true); -CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit_nulls, int64_t, int64_t, true); - -#define CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - cudf::ast::operation binary_pred, \ - cudf::null_equality compare_nulls) { \ - return cudf::conditional_full_join(left, right, binary_pred); \ - }; \ - BM_join(st, join); \ +CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_32bit, int32_t, false); +CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit, int64_t, false); +CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_32bit_nulls, int32_t, true); +CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit_nulls, int64_t, true); + +#define CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key) \ + (::benchmark::State & st) \ + { \ + 
auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + cudf::ast::operation binary_pred, \ + cudf::null_equality compare_nulls) { \ + return cudf::conditional_full_join(left, right, binary_pred); \ + }; \ + BM_join(st, join); \ } -CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_32bit, int32_t, int32_t, false); -CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_64bit, int64_t, int64_t, false); -CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_32bit_nulls, int32_t, int32_t, true); -CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_64bit_nulls, int64_t, int64_t, true); - -#define CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - cudf::ast::operation binary_pred, \ - cudf::null_equality compare_nulls) { \ - return cudf::conditional_left_anti_join(left, right, binary_pred); \ - }; \ - BM_join(st, join); \ +CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_32bit, int32_t, false); +CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_64bit, int64_t, false); +CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_32bit_nulls, int32_t, true); +CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_64bit_nulls, int64_t, true); + +#define CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + cudf::ast::operation binary_pred, \ + cudf::null_equality compare_nulls) { \ + return cudf::conditional_left_anti_join(left, right, binary_pred); \ + }; \ + BM_join(st, join); \ } -CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_32bit, - 
int32_t, - int32_t, - false); -CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_64bit, - int64_t, - int64_t, - false); -CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_32bit_nulls, - int32_t, - int32_t, - true); -CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_64bit_nulls, - int64_t, - int64_t, - true); - -#define CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - cudf::ast::operation binary_pred, \ - cudf::null_equality compare_nulls) { \ - return cudf::conditional_left_semi_join(left, right, binary_pred); \ - }; \ - BM_join(st, join); \ +CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_32bit, int32_t, false); +CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_64bit, int64_t, false); +CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_32bit_nulls, int32_t, true); +CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_64bit_nulls, int64_t, true); + +#define CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + cudf::ast::operation binary_pred, \ + cudf::null_equality compare_nulls) { \ + return cudf::conditional_left_semi_join(left, right, binary_pred); \ + }; \ + BM_join(st, join); \ } -CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_32bit, - int32_t, - int32_t, - false); -CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_64bit, - int64_t, - int64_t, - false); -CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_32bit_nulls, - 
int32_t, - int32_t, - true); -CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_64bit_nulls, - int64_t, - int64_t, - true); +CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_32bit, int32_t, false); +CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_64bit, int64_t, false); +CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_32bit_nulls, int32_t, true); +CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_64bit_nulls, int64_t, true); // inner join ----------------------------------------------------------------------- BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_32bit) diff --git a/cpp/benchmarks/join/distinct_join.cu b/cpp/benchmarks/join/distinct_join.cu index 4a68ee3878e..af8fa1f9d94 100644 --- a/cpp/benchmarks/join/distinct_join.cu +++ b/cpp/benchmarks/join/distinct_join.cu @@ -16,12 +16,10 @@ #include "join_common.hpp" -template +template void distinct_inner_join(nvbench::state& state, - nvbench::type_list>) + nvbench::type_list>) { - skip_helper(state); - auto join = [](cudf::table_view const& build_input, cudf::table_view const& probe_input, cudf::null_equality compare_nulls, @@ -35,15 +33,13 @@ void distinct_inner_join(nvbench::state& state, return hj_obj.inner_join(stream); }; - BM_join(state, join); + BM_join(state, join); } -template +template void distinct_left_join(nvbench::state& state, - nvbench::type_list>) + nvbench::type_list>) { - skip_helper(state); - auto join = [](cudf::table_view const& build_input, cudf::table_view const& probe_input, cudf::null_equality compare_nulls, @@ -57,65 +53,18 @@ void distinct_left_join(nvbench::state& state, return hj_obj.left_join(stream); }; - BM_join(state, join); + BM_join(state, join); } -// inner join ----------------------------------------------------------------------- NVBENCH_BENCH_TYPES(distinct_inner_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - 
nvbench::enum_type_list)) - .set_name("distinct_inner_join_32bit") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) - .add_int64_axis("Probe Table Size", - {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); - -NVBENCH_BENCH_TYPES(distinct_inner_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("distinct_inner_join_64bit") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) - .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); - -NVBENCH_BENCH_TYPES(distinct_inner_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("distinct_inner_join_32bit_nulls") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) - .add_int64_axis("Probe Table Size", - {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); - -NVBENCH_BENCH_TYPES(distinct_inner_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("distinct_inner_join_64bit_nulls") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) - .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); - -// left join ------------------------------------------------------------------------ -NVBENCH_BENCH_TYPES(distinct_left_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("distinct_left_join_32bit") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) - .add_int64_axis("Probe Table Size", - {100'000, 400'000, 10'000'000, 
40'000'000, 100'000'000, 240'000'000}); + NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE)) + .set_name("distinct_inner_join") + .set_type_axes_names({"Key", "Nullable"}) + .add_int64_axis("left_size", JOIN_SIZE_RANGE) + .add_int64_axis("right_size", JOIN_SIZE_RANGE); -NVBENCH_BENCH_TYPES(distinct_left_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("distinct_left_join_32bit_nulls") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) - .add_int64_axis("Probe Table Size", - {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); +NVBENCH_BENCH_TYPES(distinct_left_join, NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE)) + .set_name("distinct_left_join") + .set_type_axes_names({"Key", "Nullable"}) + .add_int64_axis("left_size", JOIN_SIZE_RANGE) + .add_int64_axis("right_size", JOIN_SIZE_RANGE); diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index 93401f01026..f7984b29d6b 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ b/cpp/benchmarks/join/generate_input_tables.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -34,7 +35,7 @@ CUDF_KERNEL void init_curand(curandState* state, int const nstates) { - int ithread = threadIdx.x + blockIdx.x * blockDim.x; + int ithread = cudf::detail::grid_1d::global_thread_id(); if (ithread < nstates) { curand_init(1234ULL, ithread, 0, state + ithread); } } @@ -46,13 +47,14 @@ CUDF_KERNEL void init_build_tbl(key_type* const build_tbl, curandState* state, int const num_states) { - auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x; - auto const stride = blockDim.x * gridDim.x; + auto const start_idx = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); assert(start_idx < num_states); 
curandState localState = state[start_idx]; - for (size_type idx = start_idx; idx < build_tbl_size; idx += stride) { + for (cudf::thread_index_type tidx = start_idx; tidx < build_tbl_size; tidx += stride) { + auto const idx = static_cast(tidx); double const x = curand_uniform_double(&localState); build_tbl[idx] = static_cast(x * (build_tbl_size / multiplicity)); @@ -71,13 +73,14 @@ CUDF_KERNEL void init_probe_tbl(key_type* const probe_tbl, curandState* state, int const num_states) { - auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x; - auto const stride = blockDim.x * gridDim.x; + auto const start_idx = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); assert(start_idx < num_states); curandState localState = state[start_idx]; - for (size_type idx = start_idx; idx < probe_tbl_size; idx += stride) { + for (cudf::thread_index_type tidx = start_idx; tidx < probe_tbl_size; tidx += stride) { + auto const idx = static_cast(tidx); key_type val; double x = curand_uniform_double(&localState); diff --git a/cpp/benchmarks/join/join.cu b/cpp/benchmarks/join/join.cu index 1c02a4488ac..c4a39da4662 100644 --- a/cpp/benchmarks/join/join.cu +++ b/cpp/benchmarks/join/join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,12 +16,10 @@ #include -template +template void nvbench_inner_join(nvbench::state& state, - nvbench::type_list>) + nvbench::type_list>) { - skip_helper(state); - auto join = [](cudf::table_view const& left_input, cudf::table_view const& right_input, cudf::null_equality compare_nulls, @@ -33,15 +31,12 @@ void nvbench_inner_join(nvbench::state& state, return hj_obj.inner_join(right_input, std::nullopt, stream); }; - BM_join(state, join); + BM_join(state, join); } -template -void nvbench_left_join(nvbench::state& state, - nvbench::type_list>) +template +void nvbench_left_join(nvbench::state& state, nvbench::type_list>) { - skip_helper(state); - auto join = [](cudf::table_view const& left_input, cudf::table_view const& right_input, cudf::null_equality compare_nulls, @@ -53,15 +48,12 @@ void nvbench_left_join(nvbench::state& state, return hj_obj.left_join(right_input, std::nullopt, stream); }; - BM_join(state, join); + BM_join(state, join); } -template -void nvbench_full_join(nvbench::state& state, - nvbench::type_list>) +template +void nvbench_full_join(nvbench::state& state, nvbench::type_list>) { - skip_helper(state); - auto join = [](cudf::table_view const& left_input, cudf::table_view const& right_input, cudf::null_equality compare_nulls, @@ -73,122 +65,23 @@ void nvbench_full_join(nvbench::state& state, return hj_obj.full_join(right_input, std::nullopt, stream); }; - BM_join(state, join); + BM_join(state, join); } -// inner join ----------------------------------------------------------------------- -NVBENCH_BENCH_TYPES(nvbench_inner_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("inner_join_32bit") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) - .add_int64_axis("Probe Table Size", - {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); - -NVBENCH_BENCH_TYPES(nvbench_inner_join, 
- NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("inner_join_64bit") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) - .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); - -NVBENCH_BENCH_TYPES(nvbench_inner_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("inner_join_32bit_nulls") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) - .add_int64_axis("Probe Table Size", - {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); - -NVBENCH_BENCH_TYPES(nvbench_inner_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("inner_join_64bit_nulls") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) - .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); - -// left join ------------------------------------------------------------------------ -NVBENCH_BENCH_TYPES(nvbench_left_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("left_join_32bit") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) - .add_int64_axis("Probe Table Size", - {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); - -NVBENCH_BENCH_TYPES(nvbench_left_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("left_join_64bit") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) - .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); - 
-NVBENCH_BENCH_TYPES(nvbench_left_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("left_join_32bit_nulls") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) - .add_int64_axis("Probe Table Size", - {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); - -NVBENCH_BENCH_TYPES(nvbench_left_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("left_join_64bit_nulls") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) - .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); - -// full join ------------------------------------------------------------------------ -NVBENCH_BENCH_TYPES(nvbench_full_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("full_join_32bit") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) - .add_int64_axis("Probe Table Size", - {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); - -NVBENCH_BENCH_TYPES(nvbench_full_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("full_join_64bit") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) - .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); - -NVBENCH_BENCH_TYPES(nvbench_full_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("full_join_32bit_nulls") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) - 
.add_int64_axis("Probe Table Size", - {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); - -NVBENCH_BENCH_TYPES(nvbench_full_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("full_join_64bit_nulls") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) - .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); +NVBENCH_BENCH_TYPES(nvbench_inner_join, NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE)) + .set_name("inner_join") + .set_type_axes_names({"Key", "Nullable"}) + .add_int64_axis("left_size", JOIN_SIZE_RANGE) + .add_int64_axis("right_size", JOIN_SIZE_RANGE); + +NVBENCH_BENCH_TYPES(nvbench_left_join, NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE)) + .set_name("left_join") + .set_type_axes_names({"Key", "Nullable"}) + .add_int64_axis("left_size", JOIN_SIZE_RANGE) + .add_int64_axis("right_size", JOIN_SIZE_RANGE); + +NVBENCH_BENCH_TYPES(nvbench_full_join, NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE)) + .set_name("full_join") + .set_type_axes_names({"Key", "Nullable"}) + .add_int64_axis("left_size", JOIN_SIZE_RANGE) + .add_int64_axis("right_size", JOIN_SIZE_RANGE); diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp index 9f869ddb1ac..9e23d28b363 100644 --- a/cpp/benchmarks/join/join_common.hpp +++ b/cpp/benchmarks/join/join_common.hpp @@ -41,6 +41,11 @@ #include +using JOIN_KEY_TYPE_RANGE = nvbench::type_list; +using JOIN_NULLABLE_RANGE = nvbench::enum_type_list; + +auto const JOIN_SIZE_RANGE = std::vector{1000, 100'000, 10'000'000}; + struct null75_generator { thrust::minstd_rand engine; thrust::uniform_int_distribution rand_gen; @@ -55,52 +60,42 @@ struct null75_generator { enum class join_t { CONDITIONAL, MIXED, HASH }; -inline void skip_helper(nvbench::state& state) -{ - auto const build_table_size = state.get_int64("Build Table 
Size"); - auto const probe_table_size = state.get_int64("Probe Table Size"); - - if (build_table_size > probe_table_size) { - state.skip("Large build tables are skipped."); - return; - } - - if (build_table_size * 100 <= probe_table_size) { - state.skip("Large probe tables are skipped."); - return; - } -} - -template void BM_join(state_type& state, Join JoinFunc) { - auto const build_table_size = [&]() { + auto const right_size = [&]() { if constexpr (std::is_same_v) { return static_cast(state.range(0)); } if constexpr (std::is_same_v) { - return static_cast(state.get_int64("Build Table Size")); + return static_cast(state.get_int64("right_size")); } }(); - auto const probe_table_size = [&]() { + auto const left_size = [&]() { if constexpr (std::is_same_v) { return static_cast(state.range(1)); } if constexpr (std::is_same_v) { - return static_cast(state.get_int64("Probe Table Size")); + return static_cast(state.get_int64("left_size")); } }(); + if constexpr (std::is_same_v) { + if (right_size > left_size) { + state.skip("Skip large right table"); + return; + } + } + double const selectivity = 0.3; int const multiplicity = 1; // Generate build and probe tables - auto build_random_null_mask = [](int size) { + auto right_random_null_mask = [](int size) { // roughly 75% nulls auto validity = thrust::make_transform_iterator(thrust::make_counting_iterator(0), null75_generator{}); @@ -111,62 +106,62 @@ void BM_join(state_type& state, Join JoinFunc) rmm::mr::get_current_device_resource()); }; - std::unique_ptr build_key_column0 = [&]() { - auto [null_mask, null_count] = build_random_null_mask(build_table_size); - return Nullable ? cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), - build_table_size, - std::move(null_mask), - null_count) - : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), - build_table_size); + std::unique_ptr right_key_column0 = [&]() { + auto [null_mask, null_count] = right_random_null_mask(right_size); + return Nullable + ? 
cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + right_size, + std::move(null_mask), + null_count) + : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), right_size); }(); - std::unique_ptr probe_key_column0 = [&]() { - auto [null_mask, null_count] = build_random_null_mask(probe_table_size); - return Nullable ? cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), - probe_table_size, - std::move(null_mask), - null_count) - : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), - probe_table_size); + std::unique_ptr left_key_column0 = [&]() { + auto [null_mask, null_count] = right_random_null_mask(left_size); + return Nullable + ? cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + left_size, + std::move(null_mask), + null_count) + : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), left_size); }(); - generate_input_tables( - build_key_column0->mutable_view().data(), - build_table_size, - probe_key_column0->mutable_view().data(), - probe_table_size, - selectivity, - multiplicity); + // build table is right table, probe table is left table + generate_input_tables(right_key_column0->mutable_view().data(), + right_size, + left_key_column0->mutable_view().data(), + left_size, + selectivity, + multiplicity); - // Copy build_key_column0 and probe_key_column0 into new columns. + // Copy right_key_column0 and left_key_column0 into new columns. // If Nullable, the new columns will be assigned new nullmasks. 
- auto const build_key_column1 = [&]() { - auto col = std::make_unique(build_key_column0->view()); + auto const right_key_column1 = [&]() { + auto col = std::make_unique(right_key_column0->view()); if (Nullable) { - auto [null_mask, null_count] = build_random_null_mask(build_table_size); + auto [null_mask, null_count] = right_random_null_mask(right_size); col->set_null_mask(std::move(null_mask), null_count); } return col; }(); - auto const probe_key_column1 = [&]() { - auto col = std::make_unique(probe_key_column0->view()); + auto const left_key_column1 = [&]() { + auto col = std::make_unique(left_key_column0->view()); if (Nullable) { - auto [null_mask, null_count] = build_random_null_mask(probe_table_size); + auto [null_mask, null_count] = right_random_null_mask(left_size); col->set_null_mask(std::move(null_mask), null_count); } return col; }(); - auto init = cudf::make_fixed_width_scalar(static_cast(0)); - auto build_payload_column = cudf::sequence(build_table_size, *init); - auto probe_payload_column = cudf::sequence(probe_table_size, *init); + auto init = cudf::make_fixed_width_scalar(static_cast(0)); + auto right_payload_column = cudf::sequence(right_size, *init); + auto left_payload_column = cudf::sequence(left_size, *init); CUDF_CHECK_CUDA(0); - cudf::table_view build_table( - {build_key_column0->view(), build_key_column1->view(), *build_payload_column}); - cudf::table_view probe_table( - {probe_key_column0->view(), probe_key_column1->view(), *probe_payload_column}); + cudf::table_view right_table( + {right_key_column0->view(), right_key_column1->view(), *right_payload_column}); + cudf::table_view left_table( + {left_key_column0->view(), left_key_column1->view(), *left_payload_column}); // Setup join parameters and result table [[maybe_unused]] std::vector columns_to_join = {0}; @@ -177,8 +172,8 @@ void BM_join(state_type& state, Join JoinFunc) for (auto _ : state) { cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = 
JoinFunc(probe_table.select(columns_to_join), - build_table.select(columns_to_join), + auto result = JoinFunc(left_table.select(columns_to_join), + right_table.select(columns_to_join), cudf::null_equality::UNEQUAL); } } @@ -191,10 +186,10 @@ void BM_join(state_type& state, Join JoinFunc) cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { rmm::cuda_stream_view stream_view{launch.get_stream()}; - auto result = JoinFunc(probe_table.select(columns_to_join), - build_table.select(columns_to_join), - probe_table.select({1}), - build_table.select({1}), + auto result = JoinFunc(left_table.select(columns_to_join), + right_table.select(columns_to_join), + left_table.select({1}), + right_table.select({1}), left_zero_eq_right_zero, cudf::null_equality::UNEQUAL, stream_view); @@ -203,8 +198,8 @@ void BM_join(state_type& state, Join JoinFunc) if constexpr (join_type == join_t::HASH) { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { rmm::cuda_stream_view stream_view{launch.get_stream()}; - auto result = JoinFunc(probe_table.select(columns_to_join), - build_table.select(columns_to_join), + auto result = JoinFunc(left_table.select(columns_to_join), + right_table.select(columns_to_join), cudf::null_equality::UNEQUAL, stream_view); }); @@ -223,7 +218,7 @@ void BM_join(state_type& state, Join JoinFunc) cuda_event_timer raii(state, true, cudf::get_default_stream()); auto result = - JoinFunc(probe_table, build_table, left_zero_eq_right_zero, cudf::null_equality::UNEQUAL); + JoinFunc(left_table, right_table, left_zero_eq_right_zero, cudf::null_equality::UNEQUAL); } } } diff --git a/cpp/benchmarks/join/left_join.cu b/cpp/benchmarks/join/left_join.cu index 96bbd1bc58e..3e398e721fa 100644 --- a/cpp/benchmarks/join/left_join.cu +++ b/cpp/benchmarks/join/left_join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,42 +16,42 @@ #include -template +template class Join : public cudf::benchmark {}; -#define LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - cudf::null_equality compare_nulls) { \ - return cudf::left_anti_join(left, right, compare_nulls); \ - }; \ - BM_join(st, join); \ +#define LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(Join, name, Key) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + cudf::null_equality compare_nulls) { \ + return cudf::left_anti_join(left, right, compare_nulls); \ + }; \ + BM_join(st, join); \ } -LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit, int32_t, int32_t, false); -LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit, int64_t, int64_t, false); -LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit_nulls, int32_t, int32_t, true); -LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit_nulls, int64_t, int64_t, true); +LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit, int32_t, false); +LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit, int64_t, false); +LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit_nulls, int32_t, true); +LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit_nulls, int64_t, true); -#define LEFT_SEMI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - cudf::null_equality compare_nulls) { \ - return 
cudf::left_semi_join(left, right, compare_nulls); \ - }; \ - BM_join(st, join); \ +#define LEFT_SEMI_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(Join, name, Key) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + cudf::null_equality compare_nulls) { \ + return cudf::left_semi_join(left, right, compare_nulls); \ + }; \ + BM_join(st, join); \ } -LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit, int32_t, int32_t, false); -LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit, int64_t, int64_t, false); -LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit_nulls, int32_t, int32_t, true); -LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit_nulls, int64_t, int64_t, true); +LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit, int32_t, false); +LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit, int64_t, false); +LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit_nulls, int32_t, true); +LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit_nulls, int64_t, true); // left anti-join ------------------------------------------------------------- BENCHMARK_REGISTER_F(Join, left_anti_join_32bit) diff --git a/cpp/benchmarks/join/mixed_join.cu b/cpp/benchmarks/join/mixed_join.cu index 67be4640f84..129ea62e7a6 100644 --- a/cpp/benchmarks/join/mixed_join.cu +++ b/cpp/benchmarks/join/mixed_join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,12 +16,10 @@ #include -template -void nvbench_mixed_inner_join( - nvbench::state& state, nvbench::type_list>) +template +void nvbench_mixed_inner_join(nvbench::state& state, + nvbench::type_list>) { - skip_helper(state); - auto join = [](cudf::table_view const& left_equality_input, cudf::table_view const& right_equality_input, cudf::table_view const& left_conditional_input, @@ -37,15 +35,13 @@ void nvbench_mixed_inner_join( compare_nulls); }; - BM_join(state, join); + BM_join(state, join); } -template -void nvbench_mixed_left_join( - nvbench::state& state, nvbench::type_list>) +template +void nvbench_mixed_left_join(nvbench::state& state, + nvbench::type_list>) { - skip_helper(state); - auto join = [](cudf::table_view const& left_equality_input, cudf::table_view const& right_equality_input, cudf::table_view const& left_conditional_input, @@ -61,15 +57,13 @@ void nvbench_mixed_left_join( compare_nulls); }; - BM_join(state, join); + BM_join(state, join); } -template -void nvbench_mixed_full_join( - nvbench::state& state, nvbench::type_list>) +template +void nvbench_mixed_full_join(nvbench::state& state, + nvbench::type_list>) { - skip_helper(state); - auto join = [](cudf::table_view const& left_equality_input, cudf::table_view const& right_equality_input, cudf::table_view const& left_conditional_input, @@ -85,15 +79,13 @@ void nvbench_mixed_full_join( compare_nulls); }; - BM_join(state, join); + BM_join(state, join); } -template -void nvbench_mixed_left_semi_join( - nvbench::state& state, nvbench::type_list>) +template +void nvbench_mixed_left_semi_join(nvbench::state& state, + nvbench::type_list>) { - skip_helper(state); - auto join = [](cudf::table_view const& left_equality_input, cudf::table_view const& right_equality_input, cudf::table_view const& left_conditional_input, @@ -109,15 +101,13 @@ void nvbench_mixed_left_semi_join( compare_nulls); }; - BM_join(state, join); + BM_join(state, join); } -template -void nvbench_mixed_left_anti_join( - 
nvbench::state& state, nvbench::type_list>) +template +void nvbench_mixed_left_anti_join(nvbench::state& state, + nvbench::type_list>) { - skip_helper(state); - auto join = [](cudf::table_view const& left_equality_input, cudf::table_view const& right_equality_input, cudf::table_view const& left_conditional_input, @@ -133,200 +123,40 @@ void nvbench_mixed_left_anti_join( compare_nulls); }; - BM_join(state, join); + BM_join(state, join); } -// inner join ----------------------------------------------------------------------- -NVBENCH_BENCH_TYPES(nvbench_mixed_inner_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("mixed_inner_join_32bit") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) - .add_int64_axis("Probe Table Size", - {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); - -NVBENCH_BENCH_TYPES(nvbench_mixed_inner_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("mixed_inner_join_64bit") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) - .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); - NVBENCH_BENCH_TYPES(nvbench_mixed_inner_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("mixed_inner_join_32bit_nulls") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) - .add_int64_axis("Probe Table Size", - {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); + NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE)) + .set_name("mixed_inner_join") + .set_type_axes_names({"Key", "Nullable"}) + .add_int64_axis("left_size", JOIN_SIZE_RANGE) + 
.add_int64_axis("right_size", JOIN_SIZE_RANGE); -NVBENCH_BENCH_TYPES(nvbench_mixed_inner_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("mixed_inner_join_64bit_nulls") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) - .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); - -// left join ------------------------------------------------------------------------ NVBENCH_BENCH_TYPES(nvbench_mixed_left_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("mixed_left_join_32bit") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) - .add_int64_axis("Probe Table Size", - {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); + NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE)) + .set_name("mixed_left_join") + .set_type_axes_names({"Key", "Nullable"}) + .add_int64_axis("left_size", JOIN_SIZE_RANGE) + .add_int64_axis("right_size", JOIN_SIZE_RANGE); -NVBENCH_BENCH_TYPES(nvbench_mixed_left_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("mixed_left_join_64bit") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) - .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); - -NVBENCH_BENCH_TYPES(nvbench_mixed_left_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("mixed_left_join_32bit_nulls") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) - .add_int64_axis("Probe Table Size", - {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); - 
-NVBENCH_BENCH_TYPES(nvbench_mixed_left_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("mixed_left_join_64bit_nulls") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) - .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); - -// full join ------------------------------------------------------------------------ NVBENCH_BENCH_TYPES(nvbench_mixed_full_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("mixed_full_join_32bit") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) - .add_int64_axis("Probe Table Size", - {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); - -NVBENCH_BENCH_TYPES(nvbench_mixed_full_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("mixed_full_join_64bit") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) - .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); - -NVBENCH_BENCH_TYPES(nvbench_mixed_full_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("mixed_full_join_32bit_nulls") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) - .add_int64_axis("Probe Table Size", - {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); - -NVBENCH_BENCH_TYPES(nvbench_mixed_full_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("mixed_full_join_64bit_nulls") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", 
{40'000'000, 50'000'000}) - .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); - -// left semi join ------------------------------------------------------------------------ -NVBENCH_BENCH_TYPES(nvbench_mixed_left_semi_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("mixed_left_semi_join_32bit") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) - .add_int64_axis("Probe Table Size", - {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); - -NVBENCH_BENCH_TYPES(nvbench_mixed_left_semi_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("mixed_left_semi_join_64bit") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) - .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); + NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE)) + .set_name("mixed_full_join") + .set_type_axes_names({"Key", "Nullable"}) + .add_int64_axis("left_size", JOIN_SIZE_RANGE) + .add_int64_axis("right_size", JOIN_SIZE_RANGE); NVBENCH_BENCH_TYPES(nvbench_mixed_left_semi_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("mixed_left_semi_join_32bit_nulls") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) - .add_int64_axis("Probe Table Size", - {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); - -NVBENCH_BENCH_TYPES(nvbench_mixed_left_semi_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("mixed_left_semi_join_64bit_nulls") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", 
{40'000'000, 50'000'000}) - .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); - -// left anti join ------------------------------------------------------------------------ -NVBENCH_BENCH_TYPES(nvbench_mixed_left_anti_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("mixed_left_anti_join_32bit") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) - .add_int64_axis("Probe Table Size", - {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); - -NVBENCH_BENCH_TYPES(nvbench_mixed_left_anti_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("mixed_left_anti_join_64bit") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) - .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); - -NVBENCH_BENCH_TYPES(nvbench_mixed_left_anti_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("mixed_left_anti_join_32bit_nulls") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) - .add_int64_axis("Probe Table Size", - {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); + NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE)) + .set_name("mixed_left_semi_join") + .set_type_axes_names({"Key", "Nullable"}) + .add_int64_axis("left_size", JOIN_SIZE_RANGE) + .add_int64_axis("right_size", JOIN_SIZE_RANGE); NVBENCH_BENCH_TYPES(nvbench_mixed_left_anti_join, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list)) - .set_name("mixed_left_anti_join_64bit_nulls") - .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) - .add_int64_axis("Build Table Size", 
{40'000'000, 50'000'000}) - .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); + NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE)) + .set_name("mixed_left_anti_join") + .set_type_axes_names({"Key", "Nullable"}) + .add_int64_axis("left_size", JOIN_SIZE_RANGE) + .add_int64_axis("right_size", JOIN_SIZE_RANGE); diff --git a/cpp/benchmarks/json/json.cu b/cpp/benchmarks/json/json.cu index a54d7d48dc4..06b793bf5f1 100644 --- a/cpp/benchmarks/json/json.cu +++ b/cpp/benchmarks/json/json.cu @@ -15,8 +15,6 @@ */ #include -#include -#include #include @@ -28,9 +26,7 @@ #include #include -#include - -class JsonPath : public cudf::benchmark {}; +#include std::vector const Books{ R"json({ @@ -77,10 +73,9 @@ struct json_benchmark_row_builder { cudf::column_device_view const d_book_pct; // Book percentage cudf::column_device_view const d_misc_order; // Misc-Store order cudf::column_device_view const d_store_order; // Books-Bicycles order - int32_t* d_offsets{}; + cudf::size_type* d_sizes{}; char* d_chars{}; - thrust::minstd_rand rng{5236}; - thrust::uniform_int_distribution dist{}; + cudf::detail::input_offsetalator d_offsets; // internal data structure for {bytes, out_ptr} with operator+= struct bytes_and_ptr { @@ -98,12 +93,10 @@ struct json_benchmark_row_builder { cudf::size_type num_items, bytes_and_ptr& output_str) { - using param_type = thrust::uniform_int_distribution::param_type; - dist.param(param_type{0, d_books_bicycles[this_idx].size() - 1}); cudf::string_view comma(",\n", 2); for (int i = 0; i < num_items; i++) { if (i > 0) { output_str += comma; } - int idx = dist(rng); + int idx = threadIdx.x % d_books_bicycles[this_idx].size(); auto item = d_books_bicycles[this_idx].element(idx); output_str += item; } @@ -155,7 +148,7 @@ struct json_benchmark_row_builder { output_str += Misc; } output_str += brace2; - if (!output_str.ptr) d_offsets[idx] = output_str.bytes; + if (!output_str.ptr) { d_sizes[idx] = output_str.bytes; } } }; @@ -182,41 +175,42 @@ 
auto build_json_string_column(int desired_bytes, int num_rows) return cudf::make_strings_column(num_rows, std::move(offsets), chars.release(), 0, {}); } -void BM_case(benchmark::State& state, std::string query_arg) +static std::string queries[] = {"$", + "$.store", + "$.store.book", + "$.store.*", + "$.store.book[*]", + "$.store.book[*].category", + "$.store['bicycle']", + "$.store.book[*]['isbn']", + "$.store.bicycle[1]"}; + +static void bench_query(nvbench::state& state) { srand(5236); - int num_rows = state.range(0); - int desired_bytes = state.range(1); + + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const desired_bytes = static_cast(state.get_int64("bytes")); + auto const query = state.get_int64("query"); + auto const json_path = queries[query]; + + auto const stream = cudf::get_default_stream(); auto input = build_json_string_column(desired_bytes, num_rows); cudf::strings_column_view scv(input->view()); - size_t num_chars = scv.chars_size(cudf::get_default_stream()); + size_t num_chars = scv.chars_size(stream); - std::string json_path(query_arg); - - for (auto _ : state) { - cuda_event_timer raii(state, true); - auto result = cudf::get_json_object(scv, json_path); - CUDF_CUDA_TRY(cudaStreamSynchronize(0)); - } + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + // This isn't strictly 100% accurate. a given query isn't necessarily + // going to visit every single incoming character but in spirit it does. + state.add_global_memory_reads(num_chars); - // this isn't strictly 100% accurate. a given query isn't necessarily - // going to visit every single incoming character. but in spirit it does. 
- state.SetBytesProcessed(state.iterations() * num_chars); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + [[maybe_unused]] auto result = cudf::get_json_object(scv, json_path); + }); } -#define JSON_BENCHMARK_DEFINE(name, query) \ - BENCHMARK_DEFINE_F(JsonPath, name)(::benchmark::State & state) { BM_case(state, query); } \ - BENCHMARK_REGISTER_F(JsonPath, name) \ - ->ArgsProduct({{100, 1000, 100000, 400000}, {300, 600, 4096}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -JSON_BENCHMARK_DEFINE(query0, "$"); -JSON_BENCHMARK_DEFINE(query1, "$.store"); -JSON_BENCHMARK_DEFINE(query2, "$.store.book"); -JSON_BENCHMARK_DEFINE(query3, "$.store.*"); -JSON_BENCHMARK_DEFINE(query4, "$.store.book[*]"); -JSON_BENCHMARK_DEFINE(query5, "$.store.book[*].category"); -JSON_BENCHMARK_DEFINE(query6, "$.store['bicycle']"); -JSON_BENCHMARK_DEFINE(query7, "$.store.book[*]['isbn']"); -JSON_BENCHMARK_DEFINE(query8, "$.store.bicycle[1]"); +NVBENCH_BENCH(bench_query) + .set_name("json_path") + .add_int64_axis("bytes", {300, 600, 4096}) + .add_int64_axis("num_rows", {100, 1000, 100000, 400000}) + .add_int64_axis("query", {0, 1, 2, 3, 4, 5, 6, 7, 8}); diff --git a/cpp/benchmarks/merge/merge_strings.cpp b/cpp/benchmarks/merge/merge_strings.cpp new file mode 100644 index 00000000000..3d0f1865490 --- /dev/null +++ b/cpp/benchmarks/merge/merge_strings.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include + +void nvbench_merge_strings(nvbench::state& state) +{ + auto stream = cudf::get_default_stream(); + + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + if (static_cast(2 * num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const table_profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .no_validity(); + auto const source_tables = create_random_table( + {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, table_profile); + + auto const sorted_lhs = cudf::sort(cudf::table_view({source_tables->view().column(0)})); + auto const sorted_rhs = cudf::sort(cudf::table_view({source_tables->view().column(1)})); + auto const lhs = sorted_lhs->view().column(0); + auto const rhs = sorted_rhs->view().column(0); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto chars_size = cudf::strings_column_view(lhs).chars_size(stream) + + cudf::strings_column_view(rhs).chars_size(stream); + state.add_global_memory_reads(chars_size); // all bytes are read + state.add_global_memory_writes(chars_size); // all bytes are written + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + [[maybe_unused]] auto result = cudf::merge( + {cudf::table_view({lhs}), cudf::table_view({rhs})}, {0}, {cudf::order::ASCENDING}); + }); +} + +NVBENCH_BENCH(nvbench_merge_strings) + .set_name("merge_strings") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/benchmarks/replace/nulls.cpp 
b/cpp/benchmarks/replace/nulls.cpp new file mode 100644 index 00000000000..ccd00050789 --- /dev/null +++ b/cpp/benchmarks/replace/nulls.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +static void replace_nulls(nvbench::state& state) +{ + auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const max_width = static_cast(state.get_int64("row_width")); + + if (static_cast(n_rows) * static_cast(max_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_width); + + auto const input_table = create_random_table( + {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{n_rows}, table_profile); + auto const input = input_table->view().column(0); + auto const repl = input_table->view().column(1); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + auto chars_size = cudf::strings_column_view(input).chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); // all bytes are read; + state.add_global_memory_writes(chars_size); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = 
cudf::replace_nulls(input, repl); }); +} + +NVBENCH_BENCH(replace_nulls) + .set_name("replace_nulls") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) + .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}); diff --git a/cpp/benchmarks/reshape/interleave.cpp b/cpp/benchmarks/reshape/interleave.cpp new file mode 100644 index 00000000000..4499e34af77 --- /dev/null +++ b/cpp/benchmarks/reshape/interleave.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include + +static void bench_interleave(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const num_cols = static_cast(state.get_int64("columns")); + + if (static_cast(num_rows) * static_cast(row_width) * num_cols >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const str_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + std::vector types(num_cols, cudf::type_id::STRING); + auto const source_table = create_random_table(types, row_count{num_rows}, str_profile); + + auto const source_view = source_table->view(); + auto const stream = cudf::get_default_stream(); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto chars_size = cudf::strings_column_view(source_view.column(0)).chars_size(stream) + + cudf::strings_column_view(source_view.column(1)).chars_size(stream); + state.add_global_memory_reads(chars_size); // all bytes are read + state.add_global_memory_writes(chars_size); // all bytes are written + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + [[maybe_unused]] auto result = cudf::interleave_columns(source_view); + }); +} + +NVBENCH_BENCH(bench_interleave) + .set_name("interleave_strings") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) + .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("columns", {2, 10, 100}); diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp index a7db972d39f..cd4d3ca964b 100644 --- a/cpp/benchmarks/string/case.cpp +++ b/cpp/benchmarks/string/case.cpp @@ -75,5 +75,5 @@ void bench_case(nvbench::state& state) NVBENCH_BENCH(bench_case) .set_name("case") .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 
2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) .add_string_axis("encoding", {"ascii", "utf8"}); diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index 6d839c1de64..ae6c8b844c8 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -80,7 +80,7 @@ std::unique_ptr build_input_column(cudf::size_type n_rows, } // longer pattern lengths demand more working memory per string -std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$"}; +std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43"}; static void bench_contains(nvbench::state& state) { @@ -114,4 +114,4 @@ NVBENCH_BENCH(bench_contains) .add_int64_axis("row_width", {32, 64, 128, 256, 512}) .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) .add_int64_axis("hit_rate", {50, 100}) // percentage - .add_int64_axis("pattern", {0, 1}); + .add_int64_axis("pattern", {0, 1, 2}); diff --git a/cpp/benchmarks/string/count.cpp b/cpp/benchmarks/string/count.cpp index a656010dca5..f964bc5d224 100644 --- a/cpp/benchmarks/string/count.cpp +++ b/cpp/benchmarks/string/count.cpp @@ -25,10 +25,13 @@ #include +static std::string patterns[] = {"\\d+", "a"}; + static void bench_count(nvbench::state& state) { - auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const pattern_index = static_cast(state.get_int64("pattern")); if (static_cast(num_rows) * static_cast(row_width) >= static_cast(std::numeric_limits::max())) { @@ -41,7 +44,7 @@ static void bench_count(nvbench::state& state) create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view 
input(table->view().column(0)); - std::string pattern = "\\d+"; + auto const pattern = patterns[pattern_index]; auto prog = cudf::strings::regex_program::create(pattern); @@ -59,4 +62,5 @@ static void bench_count(nvbench::state& state) NVBENCH_BENCH(bench_count) .set_name("count") .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("pattern", {0, 1}); diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index e866092f3a3..a9c620e4bf0 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -16,78 +16,75 @@ #include #include -#include #include +#include #include +#include #include #include #include #include -#include +#include -enum FindAPI { find, find_multi, contains, starts_with, ends_with }; +std::unique_ptr build_input_column(cudf::size_type n_rows, + cudf::size_type row_width, + int32_t hit_rate); -class StringFindScalar : public cudf::benchmark {}; - -static void BM_find_scalar(benchmark::State& state, FindAPI find_api) +static void bench_find_string(nvbench::state& state) { - cudf::size_type const n_rows{static_cast(state.range(0))}; - cudf::size_type const max_str_length{static_cast(state.range(1))}; - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); - cudf::strings_column_view input(column->view()); - cudf::string_scalar target("+"); - cudf::test::strings_column_wrapper targets({"+", "-"}); + auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const hit_rate = static_cast(state.get_int64("hit_rate")); + auto const api = state.get_string("api"); - for (auto _ : state) { - 
cuda_event_timer raii(state, true, cudf::get_default_stream()); - switch (find_api) { - case find: cudf::strings::find(input, target); break; - case find_multi: - cudf::strings::find_multiple(input, cudf::strings_column_view(targets)); - break; - case contains: cudf::strings::contains(input, target); break; - case starts_with: cudf::strings::starts_with(input, target); break; - case ends_with: cudf::strings::ends_with(input, target); break; - } + if (static_cast(n_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); } - state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); -} + auto const stream = cudf::get_default_stream(); + auto const col = build_input_column(n_rows, row_width, hit_rate); + auto const input = cudf::strings_column_view(col->view()); -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 2; - for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) { - for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) { - // avoid generating combinations that exceed the cudf column limit - size_t total_chars = static_cast(row_count) * rowlen; - if (total_chars < static_cast(std::numeric_limits::max())) { - b->Args({row_count, rowlen}); - } - } + std::vector h_targets({"5W", "5W43", "0987 5W43"}); + cudf::string_scalar target(h_targets[2]); + cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end()); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto const chars_size = input.chars_size(stream); + state.add_element_count(chars_size, "chars_size"); + state.add_global_memory_reads(chars_size); + if (api.substr(0, 4) == "find") { + 
state.add_global_memory_writes(input.size()); + } else { + state.add_global_memory_writes(input.size()); } -} -#define STRINGS_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(StringFindScalar, name) \ - (::benchmark::State & st) { BM_find_scalar(st, name); } \ - BENCHMARK_REGISTER_F(StringFindScalar, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); + if (api == "find") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::find(input, target); }); + } else if (api == "find_multi") { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::find_multiple(input, cudf::strings_column_view(targets)); + }); + } else if (api == "contains") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::contains(input, target); }); + } else if (api == "starts_with") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::starts_with(input, target); }); + } else if (api == "ends_with") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::ends_with(input, target); }); + } +} -STRINGS_BENCHMARK_DEFINE(find) -STRINGS_BENCHMARK_DEFINE(find_multi) -STRINGS_BENCHMARK_DEFINE(contains) -STRINGS_BENCHMARK_DEFINE(starts_with) -STRINGS_BENCHMARK_DEFINE(ends_with) +NVBENCH_BENCH(bench_find_string) + .set_name("find_string") + .add_string_axis("api", {"find", "find_multi", "contains", "starts_with", "ends_with"}) + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) + .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216}) + .add_int64_axis("hit_rate", {20, 80}); // percentage diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu index 161328ae088..3aff75d840e 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu @@ -60,13 +60,15 @@ constexpr int 
block_size = 256; template CUDF_KERNEL void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_type n_cols) { - using F = Functor; - cudf::size_type index = blockIdx.x * blockDim.x + threadIdx.x; - while (index < n_rows) { + using F = Functor; + auto tidx = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); + while (tidx < n_rows) { + auto const index = static_cast(tidx); for (int c = 0; c < n_cols; c++) { A[c][index] = F::f(A[c][index]); } - index += blockDim.x * gridDim.x; + tidx += stride; } } @@ -74,12 +76,14 @@ CUDF_KERNEL void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size template CUDF_KERNEL void host_dispatching_kernel(cudf::mutable_column_device_view source_column) { - using F = Functor; - T* A = source_column.data(); - cudf::size_type index = blockIdx.x * blockDim.x + threadIdx.x; - while (index < source_column.size()) { - A[index] = F::f(A[index]); - index += blockDim.x * gridDim.x; + using F = Functor; + T* A = source_column.data(); + auto tidx = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); + while (tidx < source_column.size()) { + auto const index = static_cast(tidx); + A[index] = F::f(A[index]); + tidx += stride; } } @@ -127,14 +131,15 @@ template CUDF_KERNEL void device_dispatching_kernel(cudf::mutable_table_device_view source) { cudf::size_type const n_rows = source.num_rows(); - cudf::size_type index = threadIdx.x + blockIdx.x * blockDim.x; - - while (index < n_rows) { + auto tidx = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); + while (tidx < n_rows) { + auto const index = static_cast(tidx); for (cudf::size_type i = 0; i < source.num_columns(); i++) { cudf::type_dispatcher( source.column(i).type(), RowHandle{}, source.column(i), index); } - index += blockDim.x * gridDim.x; + tidx += stride; } // while } diff --git a/cpp/cmake/Modules/ConfigureCUDA.cmake 
b/cpp/cmake/Modules/ConfigureCUDA.cmake index f79e4c37228..f75b5aef7af 100644 --- a/cpp/cmake/Modules/ConfigureCUDA.cmake +++ b/cpp/cmake/Modules/ConfigureCUDA.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -25,6 +25,11 @@ else() list(APPEND CUDF_CUDA_FLAGS -Werror=cross-execution-space-call) endif() list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations) +# This warning needs to be suppressed because some parts of cudf instantiate templated CCCL +# functions in contexts where the resulting instantiations would have internal linkage (e.g. in +# anonymous namespaces). In such contexts, the visibility attribute on the template is ignored, and +# the compiler issues a warning. This is not a problem and will be fixed in future versions of CCCL. 
+list(APPEND CUDF_CUDA_FLAGS -diag-suppress=1407) if(DISABLE_DEPRECATION_WARNINGS) list(APPEND CUDF_CXX_FLAGS -Wno-deprecated-declarations) diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake index 8c4e2b47fca..752c2028350 100644 --- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake +++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake @@ -23,8 +23,9 @@ target_link_libraries(jitify_preprocess PUBLIC ${CMAKE_DL_LIBS}) function(jit_preprocess_files) cmake_parse_arguments(ARG "" "SOURCE_DIRECTORY" "FILES" ${ARGN}) - foreach(inc IN LISTS libcudacxx_raw_includes) - list(APPEND libcudacxx_includes "-I${inc}") + set(includes) + foreach(inc IN LISTS libcudacxx_raw_includes CUDAToolkit_INCLUDE_DIRS) + list(APPEND includes "-I${inc}") endforeach() foreach(ARG_FILE ${ARG_FILES}) set(ARG_OUTPUT ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files/${ARG_FILE}.jit.hpp) @@ -44,8 +45,7 @@ function(jit_preprocess_files) $ ${ARG_FILE} -o ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files -i -m -std=c++17 -remove-unused-globals -D_FILE_OFFSET_BITS=64 -D__CUDACC_RTC__ -I${CUDF_SOURCE_DIR}/include - -I${CUDF_SOURCE_DIR}/src ${libcudacxx_includes} -I${CUDAToolkit_INCLUDE_DIRS} - --no-preinclude-workarounds --no-replace-pragma-once + -I${CUDF_SOURCE_DIR}/src ${includes} --no-preinclude-workarounds --no-replace-pragma-once COMMENT "Custom command to JIT-compile files." ) endforeach() diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 114a1f98a68..0afdc526981 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -26,13 +26,20 @@ include_guard(GLOBAL) # pyarrow. function(find_libarrow_in_python_wheel PYARROW_VERSION) string(REPLACE "." ";" PYARROW_VER_COMPONENTS "${PYARROW_VERSION}") - list(GET PYARROW_VER_COMPONENTS 0 PYARROW_SO_VER) - # The soname for Arrow libraries is constructed using the major version plus "00". 
Note that, - # although it may seem like it due to Arrow almost exclusively releasing new major versions (i.e. - # `${MINOR_VERSION}${PATCH_VERSION}` is almost always equivalent to "00"), - # the soname is not generated by concatenating the major, minor, and patch versions into a single - # version number soname, just `${MAJOR_VERSION}00` - set(PYARROW_LIB "libarrow.so.${PYARROW_SO_VER}00") + list(GET PYARROW_VER_COMPONENTS 0 PYARROW_MAJOR_VER) + list(GET PYARROW_VER_COMPONENTS 1 PYARROW_MINOR_VER) + + # Ensure that the major and minor versions are two digits long + string(LENGTH ${PYARROW_MAJOR_VER} PYARROW_MAJOR_LENGTH) + string(LENGTH ${PYARROW_MINOR_VER} PYARROW_MINOR_LENGTH) + if(${PYARROW_MAJOR_LENGTH} EQUAL 1) + set(PYARROW_MAJOR_VER "0${PYARROW_MAJOR_VER}") + endif() + if(${PYARROW_MINOR_LENGTH} EQUAL 1) + set(PYARROW_MINOR_VER "0${PYARROW_MINOR_VER}") + endif() + + set(PYARROW_LIB "libarrow.so.${PYARROW_MAJOR_VER}${PYARROW_MINOR_VER}") string( APPEND @@ -68,37 +75,6 @@ list(POP_BACK CMAKE_PREFIX_PATH) find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL) add_library(arrow_shared ALIAS Arrow::Arrow) - # When using the libarrow inside a wheel, whether or not libcudf may be built using the new C++11 - # ABI is dependent on whether the libarrow inside the wheel was compiled using that ABI because we - # need the arrow library that we bundle in cudf to be ABI-compatible with the one inside pyarrow. - # We determine what options to use by checking the glibc version on the current system, which is - # also how pip determines which manylinux-versioned pyarrow wheel to install. Note that tests will - # not build successfully without also propagating these options to builds of GTest. Similarly, - # benchmarks will not work without updating GBench (and possibly NVBench) builds. We are currently - # ignoring these limitations since we don't anticipate using this feature except for building - # wheels. 
- enable_language(C) - execute_process( - COMMAND ${CMAKE_C_COMPILER} -print-file-name=libc.so.6 - OUTPUT_VARIABLE GLIBC_EXECUTABLE - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - execute_process( - COMMAND ${GLIBC_EXECUTABLE} - OUTPUT_VARIABLE GLIBC_OUTPUT - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - string(REGEX MATCH "stable release version ([0-9]+\\.[0-9]+)" GLIBC_VERSION ${GLIBC_OUTPUT}) - string(REPLACE "stable release version " "" GLIBC_VERSION ${GLIBC_VERSION}) - string(REPLACE "." ";" GLIBC_VERSION_LIST ${GLIBC_VERSION}) - list(GET GLIBC_VERSION_LIST 1 GLIBC_VERSION_MINOR) - if(GLIBC_VERSION_MINOR LESS 28) - target_compile_options( - Arrow::Arrow INTERFACE "$<$:-D_GLIBCXX_USE_CXX11_ABI=0>" - "$<$:-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0>" - ) - endif() - rapids_export_package(BUILD Arrow cudf-exports) rapids_export_package(INSTALL Arrow cudf-exports) endfunction() @@ -334,7 +310,20 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB " ) endif() - + rapids_cmake_install_lib_dir(lib_dir) + if(TARGET arrow_static) + get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES) + # The `arrow_static` library is leaking a dependency on the object libraries it was built with + # we need to remove this from the interface, since keeping them around would cause duplicate + # symbols and CMake export errors + if(interface_libs MATCHES "arrow_array" AND interface_libs MATCHES "arrow_compute") + string(REPLACE "BUILD_INTERFACE:" "BUILD_LOCAL_INTERFACE:" interface_libs + "${interface_libs}" + ) + set_target_properties(arrow_static PROPERTIES INTERFACE_LINK_LIBRARIES "${interface_libs}") + get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES) + endif() + endif() rapids_export( BUILD Arrow VERSION ${VERSION} @@ -441,7 +430,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow) set(CUDF_VERSION_Arrow # This version must be kept in sync with the libarrow version pinned for builds in # dependencies.yaml. 
- 14.0.2 + 16.1.0 CACHE STRING "The version of Arrow to find (or build)" ) endif() diff --git a/cpp/cmake/thirdparty/get_flatbuffers.cmake b/cpp/cmake/thirdparty/get_flatbuffers.cmake new file mode 100644 index 00000000000..b0ece38b8ef --- /dev/null +++ b/cpp/cmake/thirdparty/get_flatbuffers.cmake @@ -0,0 +1,33 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# Use CPM to find or clone flatbuffers +function(find_and_configure_flatbuffers VERSION) + + rapids_cpm_find( + flatbuffers ${VERSION} + GLOBAL_TARGETS flatbuffers + CPM_ARGS + GIT_REPOSITORY https://github.com/google/flatbuffers.git + GIT_TAG v${VERSION} + GIT_SHALLOW TRUE + ) + + rapids_export_find_package_root( + BUILD flatbuffers "${flatbuffers_BINARY_DIR}" EXPORT_SET cudf-exports + ) + +endfunction() + +find_and_configure_flatbuffers(24.3.25) diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake index cfb219448f1..10e6b026d9a 100644 --- a/cpp/cmake/thirdparty/get_gtest.cmake +++ b/cpp/cmake/thirdparty/get_gtest.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
# # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -17,22 +17,7 @@ function(find_and_configure_gtest) include(${rapids-cmake-dir}/cpm/gtest.cmake) # Find or install GoogleTest - rapids_cpm_gtest(BUILD_EXPORT_SET cudf-testing-exports INSTALL_EXPORT_SET cudf-testing-exports) - - if(GTest_ADDED) - rapids_export( - BUILD GTest - VERSION ${GTest_VERSION} - EXPORT_SET GTestTargets - GLOBAL_TARGETS gtest gmock gtest_main gmock_main - NAMESPACE GTest:: - ) - - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root( - BUILD GTest [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-testing-exports - ) - endif() + rapids_cpm_gtest(BUILD_STATIC) endfunction() diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake new file mode 100644 index 00000000000..025bff7d8f0 --- /dev/null +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -0,0 +1,32 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# This function finds nanoarrow and sets any additional necessary environment variables. 
+function(find_and_configure_nanoarrow) + # Currently we need to always build nanoarrow so we don't pickup a previous installed version + set(CPM_DOWNLOAD_nanoarrow ON) + rapids_cpm_find( + nanoarrow 0.5.0 + GLOBAL_TARGETS nanoarrow + CPM_ARGS + GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git + GIT_TAG 11e73a8c85b45e3d49c8c541b4e1497a649fe03c + GIT_SHALLOW FALSE + OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" + ) + set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) + rapids_export_find_package_root(BUILD nanoarrow "${nanoarrow_BINARY_DIR}" EXPORT_SET cudf-exports) +endfunction() + +find_and_configure_nanoarrow() diff --git a/cpp/cmake/thirdparty/get_nvbench.cmake b/cpp/cmake/thirdparty/get_nvbench.cmake index bbd22693ba4..84c27dd9d56 100644 --- a/cpp/cmake/thirdparty/get_nvbench.cmake +++ b/cpp/cmake/thirdparty/get_nvbench.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -18,9 +18,6 @@ function(find_and_configure_nvbench) include(${rapids-cmake-dir}/cpm/nvbench.cmake) include(${rapids-cmake-dir}/cpm/package_override.cmake) - set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") - rapids_cpm_package_override("${cudf_patch_dir}/nvbench_override.json") - rapids_cpm_nvbench(BUILD_STATIC) endfunction() diff --git a/cpp/cmake/thirdparty/get_nvtx.cmake b/cpp/cmake/thirdparty/get_nvtx.cmake index c722c4f70f1..e236d586522 100644 --- a/cpp/cmake/thirdparty/get_nvtx.cmake +++ b/cpp/cmake/thirdparty/get_nvtx.cmake @@ -12,16 +12,14 @@ # the License. 
# ============================================================================= -# This function finds NVTX and sets any additional necessary environment variables. +# Need to call rapids_cpm_nvtx3 to get support for an installed version of nvtx3 and to support +# installing it ourselves function(find_and_configure_nvtx) - rapids_cpm_find( - NVTX3 3.1.0 - GLOBAL_TARGETS nvtx3-c nvtx3-cpp - CPM_ARGS - GIT_REPOSITORY https://github.com/NVIDIA/NVTX.git - GIT_TAG v3.1.0 - GIT_SHALLOW TRUE SOURCE_SUBDIR c - ) + include(${rapids-cmake-dir}/cpm/nvtx3.cmake) + + # Find or install nvtx3 + rapids_cpm_nvtx3(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) + endfunction() find_and_configure_nvtx() diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index 68fc8979c46..059f713e7a5 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -18,20 +18,45 @@ "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.", "fixed_in" : "" }, + { + "file" : "${current_json_dir}/revert_pr_211_cccl_2.5.0.diff", + "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.", + "fixed_in" : "" + }, + { + "file": "cccl/kernel_pointer_hiding.diff", + "issue": "Hide APIs that accept kernel pointers [https://github.com/NVIDIA/cccl/pull/1395]", + "fixed_in": "2.4" + }, { "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", "fixed_in" : "" }, + { + "file" : "${current_json_dir}/thrust_disable_64bit_dispatching_cccl_2.5.0.diff", + "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels 
[https://github.com/rapidsai/cudf/pull/11437]", + "fixed_in" : "" + }, { "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff", "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", "fixed_in" : "" }, + { + "file" : "${current_json_dir}/thrust_faster_sort_compile_times_cccl_2.5.0.diff", + "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", + "fixed_in" : "" + }, { "file" : "${current_json_dir}/thrust_faster_scan_compile_times.diff", "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]", "fixed_in" : "" + }, + { + "file" : "${current_json_dir}/thrust_faster_scan_compile_times_cccl_2.5.0.diff", + "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]", + "fixed_in" : "" } ] } diff --git a/cpp/cmake/thirdparty/patches/nvbench_global_setup.diff b/cpp/cmake/thirdparty/patches/nvbench_global_setup.diff deleted file mode 100644 index 04f96f49b48..00000000000 --- a/cpp/cmake/thirdparty/patches/nvbench_global_setup.diff +++ /dev/null @@ -1,29 +0,0 @@ -diff --git a/nvbench/main.cuh b/nvbench/main.cuh -index 0ba82d7..cca5273 100644 ---- a/nvbench/main.cuh -+++ b/nvbench/main.cuh -@@ -54,6 +54,16 @@ - // clang-format on - #endif - -+#ifndef NVBENCH_ENVIRONMENT -+namespace nvbench { -+struct no_environment -+{ -+ no_environment(int, char const *const *) {} -+}; -+} -+#define NVBENCH_ENVIRONMENT nvbench::no_environment -+#endif -+ - #define NVBENCH_MAIN_PARSE(argc, argv) \ - nvbench::option_parser parser; \ - parser.parse(argc, argv) -@@ -77,6 +87,7 @@ - printer.set_total_state_count(total_states); \ - \ - printer.set_completed_state_count(0); \ -+ [[maybe_unused]] auto env_state = 
NVBENCH_ENVIRONMENT(argc, argv); \ - for (auto &bench_ptr : benchmarks) \ - { \ - bench_ptr->set_printer(printer); \ diff --git a/cpp/cmake/thirdparty/patches/nvbench_override.json b/cpp/cmake/thirdparty/patches/nvbench_override.json deleted file mode 100644 index ad9b19c29c1..00000000000 --- a/cpp/cmake/thirdparty/patches/nvbench_override.json +++ /dev/null @@ -1,14 +0,0 @@ - -{ - "packages" : { - "nvbench" : { - "patches" : [ - { - "file" : "${current_json_dir}/nvbench_global_setup.diff", - "issue" : "Fix add support for global setup to initialize RMM in nvbench [https://github.com/NVIDIA/nvbench/pull/123]", - "fixed_in" : "" - } - ] - } - } -} diff --git a/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff new file mode 100644 index 00000000000..27ff16744f5 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff @@ -0,0 +1,47 @@ +diff --git a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h +index 046eb83c0..8047c9701 100644 +--- a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h ++++ b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h +@@ -53,41 +53,15 @@ namespace cuda_cub + + namespace __copy + { +-template +-OutputIt THRUST_RUNTIME_FUNCTION device_to_device( +- execution_policy& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::true_type) +-{ +- typedef typename thrust::iterator_traits::value_type InputTy; +- const auto n = thrust::distance(first, last); +- if (n > 0) +- { +- cudaError status; +- status = trivial_copy_device_to_device( +- policy, +- reinterpret_cast(thrust::raw_pointer_cast(&*result)), +- reinterpret_cast(thrust::raw_pointer_cast(&*first)), +- n); +- cuda_cub::throw_on_error(status, "__copy:: D->D: failed"); +- } +- +- return result + n; +-} + + template + OutputIt THRUST_RUNTIME_FUNCTION device_to_device( 
+- execution_policy& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::false_type) ++ execution_policy& policy, InputIt first, InputIt last, OutputIt result) + { + typedef typename thrust::iterator_traits::value_type InputTy; + return cuda_cub::transform(policy, first, last, result, thrust::identity()); + } + +-template +-OutputIt THRUST_RUNTIME_FUNCTION +-device_to_device(execution_policy& policy, InputIt first, InputIt last, OutputIt result) +-{ +- return device_to_device( +- policy, first, last, result, typename is_indirectly_trivially_relocatable_to::type()); +-} + } // namespace __copy + + } // namespace cuda_cub diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff new file mode 100644 index 00000000000..6ae1e1c917b --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff @@ -0,0 +1,25 @@ +diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h +index 2a3cc4e33..8fb337b26 100644 +--- a/thrust/thrust/system/cuda/detail/dispatch.h ++++ b/thrust/thrust/system/cuda/detail/dispatch.h +@@ -44,8 +44,7 @@ + } \ + else \ + { \ +- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ +- status = call arguments; \ ++ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ + } + + /** +@@ -66,9 +65,7 @@ + } \ + else \ + { \ +- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ +- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ +- status = call arguments; \ ++ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ + } + /** + * Dispatch between 32-bit and 64-bit index based versions of the same algorithm diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff 
b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff new file mode 100644 index 00000000000..fee46046194 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff @@ -0,0 +1,39 @@ +diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh +index 0606485bb..dbb99ff13 100644 +--- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh ++++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh +@@ -1085,7 +1085,7 @@ struct DeviceRadixSortPolicy + }; + + /// SM60 (GP100) +- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + enum + { +diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh +index f39613adb..75bd16ff9 100644 +--- a/cub/cub/device/dispatch/dispatch_reduce.cuh ++++ b/cub/cub/device/dispatch/dispatch_reduce.cuh +@@ -488,7 +488,7 @@ struct DeviceReducePolicy + }; + + /// SM60 +- struct Policy600 : ChainedPolicy<600, Policy600, Policy350> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + static constexpr int threads_per_block = 256; + static constexpr int items_per_thread = 16; +diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh +index 419908c4e..6ab0840e1 100644 +--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh ++++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh +@@ -339,7 +339,7 @@ struct DeviceScanPolicy + /// SM600 + struct Policy600 + : DefaultTuning +- , ChainedPolicy<600, Policy600, Policy520> ++ , ChainedPolicy<600, Policy600, Policy600> + {}; + + /// SM800 diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff new file mode 100644 index 00000000000..cb0cc55f4d2 --- /dev/null +++ 
b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff @@ -0,0 +1,39 @@ +diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh +index eb76ebb0b..c6c529a50 100644 +--- a/cub/cub/block/block_merge_sort.cuh ++++ b/cub/cub/block/block_merge_sort.cuh +@@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( + KeyT key1 = keys_shared[keys1_beg]; + KeyT key2 = keys_shared[keys2_beg]; + +-#pragma unroll ++#pragma unroll 1 + for (int item = 0; item < ITEMS_PER_THREAD; ++item) + { + bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); +@@ -376,7 +376,7 @@ public: + // + KeyT max_key = oob_default; + +-#pragma unroll ++#pragma unroll 1 + for (int item = 1; item < ITEMS_PER_THREAD; ++item) + { + if (ITEMS_PER_THREAD * linear_tid + item < valid_items) +diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh +index 7d9e8622f..da5627306 100644 +--- a/cub/cub/thread/thread_sort.cuh ++++ b/cub/cub/thread/thread_sort.cuh +@@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE + { + constexpr bool KEYS_ONLY = ::cuda::std::is_same::value; + +-#pragma unroll ++#pragma unroll 1 + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + { +-#pragma unroll ++#pragma unroll 1 + for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) + { + if (compare_op(keys[j + 1], keys[j])) diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index 8188c466312..ff80c2daab8 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -1,4 +1,4 @@ -# libcudf C++ Developer Guide {#DEVELOPER_GUIDE} +# libcudf C++ Developer Guide This document serves as a guide for contributors to libcudf C++ code. Developers should also refer to these additional files for further documentation of libcudf best practices. 
@@ -84,7 +84,7 @@ prefixed with an underscore. ```c++ template -void algorithm_function(int x, rmm::cuda_stream_view s, rmm::device_memory_resource* mr) +void algorithm_function(int x, rmm::cuda_stream_view s, rmm::device_async_resource_ref mr) { ... } @@ -194,9 +194,10 @@ and produce `unique_ptr`s to owning objects as output. For example, std::unique_ptr sort(table_view const& input); ``` -## rmm::device_memory_resource +## Memory Resources -libcudf allocates all device memory via RMM memory resources (MR). See the +libcudf allocates all device memory via RMM memory resources (MR) or CUDA MRs. Either type +can be passed to libcudf functions via `rmm::device_async_resource_ref` parameters. See the [RMM documentation](https://github.com/rapidsai/rmm/blob/main/README.md) for details. ### Current Device Memory Resource @@ -206,6 +207,27 @@ RMM provides a "default" memory resource for each device that can be accessed an respectively. All memory resource parameters should be defaulted to use the return value of `rmm::mr::get_current_device_resource()`. +### Resource Refs + +Memory resources are passed via resource ref parameters. A resource ref is a memory resource wrapper +that enables consumers to specify properties of resources that they expect. These are defined +in the `cuda::mr` namespace of libcu++, but RMM provides some convenience wrappers in +`rmm/resource_ref.hpp`: + - `rmm::device_resource_ref` accepts a memory resource that provides synchronous allocation + of device-accessible memory. + - `rmm::device_async_resource_ref` accepts a memory resource that provides stream-ordered allocation + of device-accessible memory. + - `rmm::host_resource_ref` accepts a memory resource that provides synchronous allocation of host- + accessible memory. + - `rmm::host_async_resource_ref` accepts a memory resource that provides stream-ordered allocation + of host-accessible memory. 
+ - `rmm::host_device_resource_ref` accepts a memory resource that provides synchronous allocation of + host- and device-accessible memory. + - `rmm::host_async_resource_ref` accepts a memory resource that provides stream-ordered allocation + of host- and device-accessible memory. + +See the libcu++ [docs on `resource_ref`](https://nvidia.github.io/cccl/libcudacxx/extended_api/memory_resource/resource_ref.html) for more information. + ## cudf::column `cudf::column` is a core owning data structure in libcudf. Most libcudf public APIs produce either @@ -519,23 +541,23 @@ how device memory is allocated. ### Output Memory -Any libcudf API that allocates memory that is *returned* to a user must accept a pointer to a -`device_memory_resource` as the last parameter. Inside the API, this memory resource must be used -to allocate any memory for returned objects. It should therefore be passed into functions whose -outputs will be returned. Example: +Any libcudf API that allocates memory that is *returned* to a user must accept a +`rmm::device_async_resource_ref` as the last parameter. Inside the API, this memory resource must +be used to allocate any memory for returned objects. It should therefore be passed into functions +whose outputs will be returned. Example: ```c++ // Returned `column` contains newly allocated memory, // therefore the API must accept a memory resource pointer std::unique_ptr returns_output_memory( - ..., rmm::device_memory_resource * mr = rmm::mr::get_current_device_resource()); + ..., rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); // This API does not allocate any new *output* memory, therefore // a memory resource is unnecessary void does_not_allocate_output_memory(...); ``` -This rule automatically applies to all detail APIs that allocates memory. Any detail API may be +This rule automatically applies to all detail APIs that allocate memory. 
Any detail API may be called by any public API, and therefore could be allocating memory that is returned to the user. To support such uses cases, all detail APIs allocating memory resources should accept an `mr` parameter. Callers are responsible for either passing through a provided `mr` or @@ -549,7 +571,7 @@ obtained from `rmm::mr::get_current_device_resource()` for temporary memory allo ```c++ rmm::device_buffer some_function( - ..., rmm::mr::device_memory_resource mr * = rmm::mr::get_current_device_resource()) { + ..., rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) { rmm::device_buffer returned_buffer(..., mr); // Returned buffer uses the passed in MR ... rmm::device_buffer temporary_buffer(...); // Temporary buffer uses default MR @@ -561,11 +583,11 @@ rmm::device_buffer some_function( ### Memory Management libcudf code generally eschews raw pointers and direct memory allocation. Use RMM classes built to -use `device_memory_resource`s for device memory allocation with automated lifetime management. +use memory resources for device memory allocation with automated lifetime management. #### rmm::device_buffer Allocates a specified number of bytes of untyped, uninitialized device memory using a -`device_memory_resource`. If no resource is explicitly provided, uses +memory resource. If no `rmm::device_async_resource_ref` is explicitly provided, it uses `rmm::mr::get_current_device_resource()`. `rmm::device_buffer` is movable and copyable on a stream. A copy performs a deep copy of the @@ -806,7 +828,7 @@ This iterator returns the validity of the underlying element (`true` or `false`) The proliferation of data types supported by libcudf can result in long compile times. One area where compile time was a problem is in types used to store indices, which can be any integer type. 
-The "Indexalator", or index-normalizing iterator (`include/cudf/detail/indexalator.cuh`), can be +The "indexalator", or index-normalizing iterator (`include/cudf/detail/indexalator.cuh`), can be used for index types (integers) without requiring a type-specific instance. It can be used for any iterator interface for reading an array of integer values of type `int8`, `int16`, `int32`, `int64`, `uint8`, `uint16`, `uint32`, or `uint64`. Reading specific elements always returns a @@ -834,6 +856,41 @@ thrust::lower_bound(rmm::exec_policy(stream), thrust::less()); ``` +### Offset-normalizing iterators + +Like the [indexalator](#index-normalizing-iterators), +the "offsetalator", or offset-normalizing iterator (`include/cudf/detail/offsetalator.cuh`), can be +used for offset column types (`INT32` or `INT64` only) without requiring a type-specific instance. +This is helpful when reading or building [strings columns](#strings-columns). +The normalized type is `int64` which means an `input_offsetsalator` will return `int64` type values +for both `INT32` and `INT64` offsets columns. +Likewise, an `output_offselator` can accept `int64` type values to store into either an +`INT32` or `INT64` output offsets column created appropriately. + +Use the `cudf::detail::offsetalator_factory` to create an appropriate input or output iterator from an offsets column_view. 
+Example input iterator usage: + +```c++ + // convert the sizes to offsets + auto [offsets, char_bytes] = cudf::strings::detail::make_offsets_child_column( + output_sizes.begin(), output_sizes.end(), stream, mr); + auto d_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(offsets->view()); + // use d_offsets to address the output row bytes +``` + +Example output iterator usage: + +```c++ + // create offsets column as either INT32 or INT64 depending on the number of bytes + auto offsets_column = cudf::strings::detail::create_offsets_child_column(total_bytes, + offsets_count, + stream, mr); + auto d_offsets = + cudf::detail::offsetalator_factory::make_output_iterator(offsets_column->mutable_view()); + // write appropriate offset values to d_offsets +``` + ## Namespaces ### External @@ -921,13 +978,14 @@ Use the `CUDF_EXPECTS` macro to enforce runtime conditions necessary for correct Example usage: ```c++ -CUDF_EXPECTS(lhs.type() == rhs.type(), "Column type mismatch"); +CUDF_EXPECTS(cudf::have_same_types(lhs, rhs), "Type mismatch", cudf::data_type_error); ``` The first argument is the conditional expression expected to resolve to `true` under normal -conditions. If the conditional evaluates to `false`, then an error has occurred and an instance of -`cudf::logic_error` is thrown. The second argument to `CUDF_EXPECTS` is a short description of the -error that has occurred and is used for the exception's `what()` message. +conditions. The second argument to `CUDF_EXPECTS` is a short description of the error that has +occurred and is used for the exception's `what()` message. If the conditional evaluates to +`false`, then an error has occurred and an instance of the exception class in the third argument +(or the default, `cudf::logic_error`) is thrown. There are times where a particular code path, if reached, should indicate an error no matter what. For example, often the `default` case of a `switch` statement represents an invalid alternative. 
@@ -1026,6 +1084,12 @@ types such as numeric types and timestamps/durations, adding support for nested Enabling an algorithm differently for different types uses either template specialization or SFINAE, as discussed in [Specializing Type-Dispatched Code Paths](#specializing-type-dispatched-code-paths). +## Comparing Data Types + +When comparing the data types of two columns or scalars, do not directly compare +`a.type() == b.type()`. Nested types such as lists of structs of integers will not be handled +properly if only the top level type is compared. Instead, use the `cudf::have_same_types` function. + # Type Dispatcher libcudf stores data (for columns and scalars) "type erased" in `void*` device memory. This @@ -1212,18 +1276,20 @@ This is related to [Arrow's "Variable-Size List" memory layout](https://arrow.ap Strings are represented as a column with a data device buffer and a child offsets column. The parent column's type is `STRING` and its data holds all the characters across all the strings packed together -but its size represents the number of strings in the column, and its null mask represents the -validity of each string. To summarize, the strings column children are: - -1. A non-nullable column of [`size_type`](#cudfsize_type) elements that indicates the offset to the beginning of each - string in a dense data buffer of all characters. +but its size represents the number of strings in the column and its null mask represents the +validity of each string. -With this representation, `data[offsets[i]]` is the first character of string `i`, and the -size of string `i` is given by `offsets[i+1] - offsets[i]`. The following image shows an example of -this compound column representation of strings. +The strings column contains a single, non-nullable child column +of offset elements that indicates the byte position offset to the beginning of each +string in the dense data buffer of all characters. 
With this representation, `data[offsets[i]]` is the +first character of string `i`, and the size of string `i` is given by `offsets[i+1] - offsets[i]`. +The following image shows an example of this compound column representation of strings. ![strings](strings.png) +The type of the offsets column is either `INT32` or `INT64` depending on the number of bytes in the data buffer. +See [`cudf::strings_view`](#cudfstrings_column_view-and-cudfstring_view) for more information on processing individual string rows. + ## Structs columns A struct is a nested data type with a set of child columns each representing an individual field @@ -1266,7 +1332,7 @@ struct column's layout is as follows. (Note that null masks should be read from } ``` -The last struct row (index 3) is not null, but has a null value in the INT32 field. Also, row 2 of +The last struct row (index 3) is not null, but has a null value in the `INT32` field. Also, row 2 of the struct column is null, making its corresponding fields also null. Therefore, bit 2 is unset in the null masks of both struct fields. @@ -1322,18 +1388,27 @@ libcudf provides view types for nested column types as well as for the data elem ### cudf::strings_column_view and cudf::string_view -`cudf::strings_column_view` is a view of a strings column, like `cudf::column_view` is a view of -any `cudf::column`. `cudf::string_view` is a view of a single string, and therefore -`cudf::string_view` is the data type of a `cudf::column` of type `STRING` just like `int32_t` is the -data type for a `cudf::column` of type [`size_type`](#cudfsize_type). As its name implies, this is a -read-only object instance that points to device memory inside the strings column. It's lifespan is -the same (or less) as the column it views. +A `cudf::strings_column_view` wraps a strings column and contains a parent +`cudf::column_view` as a view of the strings column and an offsets `cudf::column_view` +which is a child of the parent. 
+The parent view contains the offset, size, and validity mask for the strings column. +The offsets view is non-nullable with `offset()==0` and its own size. +Since the offset column type can be either `INT32` or `INT64` it is useful to use the +offset normalizing iterators [offsetalator](#offset-normalizing-iterators) to access individual offset values. + +A `cudf::string_view` is a view of a single string and therefore +is the data type of a `cudf::column` of type `STRING` just like `int32_t` is the +data type for a `cudf::column` of type `INT32`. As its name implies, this is a +read-only object instance that points to device memory inside the strings column. +Its lifespan is the same (or less) as the column it views. +An individual strings column row and a `cudf::string_view` is limited to [`size_type`](#cudfsize_type) bytes. Use the `column_device_view::element` method to access an individual row element. Like any other column, do not call `element()` on a row that is null. ```c++ - cudf::column_device_view d_strings; + cudf::strings_column_view scv; + auto d_strings = cudf::column_device_view::create(scv.parent(), stream); ... if( d_strings.is_valid(row_index) ) { string_view d_str = d_strings.element(row_index); @@ -1341,27 +1416,27 @@ column, do not call `element()` on a row that is null. } ``` -A null string is not the same as an empty string. Use the `string_scalar` class if you need an +A null string is not the same as an empty string. Use the `cudf::string_scalar` class if you need an instance of a class object to represent a null string. -The `string_view` contains comparison operators `<,>,==,<=,>=` that can be used in many cudf -functions like `sort` without string-specific code. The data for a `string_view` instance is +The `cudf::string_view` contains comparison operators `<,>,==,<=,>=` that can be used in many cudf +functions like `sort` without string-specific code. 
The data for a `cudf::string_view` instance is required to be [UTF-8](#utf-8) and all operators and methods expect this encoding. Unless documented otherwise, position and length parameters are specified in characters and not bytes. The class also -includes a `string_view::const_iterator` which can be used to navigate through individual characters +includes a `cudf::string_view::const_iterator` which can be used to navigate through individual characters within the string. -`cudf::type_dispatcher` dispatches to the `string_view` data type when invoked on a `STRING` column. +`cudf::type_dispatcher` dispatches to the `cudf::string_view` data type when invoked on a `STRING` column. #### UTF-8 The libcudf strings column only supports UTF-8 encoding for strings data. [UTF-8](https://en.wikipedia.org/wiki/UTF-8) is a variable-length character encoding wherein each character can be 1-4 bytes. This means the length of a string is not the same as its size in bytes. -For this reason, it is recommended to use the `string_view` class to access these characters for +For this reason, it is recommended to use the `cudf::string_view` class to access these characters for most operations. -The `string_view.cuh` header also includes some utility methods for reading and writing +The `cudf/strings/detail/utf8.hpp` header also includes some utility methods for reading and writing (`to_char_utf8/from_char_utf8`) individual UTF-8 characters to/from byte arrays. ### cudf::lists_column_view and cudf::lists_view @@ -1384,3 +1459,25 @@ cuIO is a component of libcudf that provides GPU-accelerated reading and writing formats commonly used in data analytics, including CSV, Parquet, ORC, Avro, and JSON_Lines. // TODO: add more detail and move to a separate file. + +# Debugging Tips + +Here are some tools that can help with debugging libcudf (besides printf of course): +1. 
`cuda-gdb`\ + Follow the instructions in the [Contributor to cuDF guide](../../../CONTRIBUTING.md#debugging-cudf) to build + and run libcudf with debug symbols. +2. `compute-sanitizer`\ + The [CUDA Compute Sanitizer](https://docs.nvidia.com/compute-sanitizer/ComputeSanitizer/index.html) + tool can be used to locate many CUDA reported errors by providing a call stack + close to where the error occurs even with a non-debug build. The sanitizer includes various + tools including `memcheck`, `racecheck`, and `initcheck` as well as others. + The `racecheck` and `initcheck` have been known to produce false positives. +3. `cudf::test::print()`\ + The `print()` utility can be called within a gtest to output the data in a `cudf::column_view`. + More information is available in the [Testing Guide](TESTING.md#printing-and-accessing-column-data) +4. GCC Address Sanitizer\ + The GCC ASAN can also be used by adding the `-fsanitize=address` compiler flag. + There is a compatibility issue with the CUDA runtime that can be worked around by setting + environment variable `ASAN_OPTIONS=protect_shadow_gap=0` before running the executable. + Note that the CUDA `compute-sanitizer` can also be used with GCC ASAN by setting the + environment variable `ASAN_OPTIONS=protect_shadow_gap=0,alloc_dealloc_mismatch=0`. diff --git a/cpp/doxygen/developer_guide/TESTING.md b/cpp/doxygen/developer_guide/TESTING.md index a4ffe0f575b..9c86be5a55d 100644 --- a/cpp/doxygen/developer_guide/TESTING.md +++ b/cpp/doxygen/developer_guide/TESTING.md @@ -455,10 +455,19 @@ Column comparison functions in the `cudf::test::detail` namespace should **NOT** ### Printing and accessing column data -`include/cudf_test/column_utilities.hpp` defines various functions and overloads for printing +The `` header defines various functions and overloads for printing columns (`print`), converting column data to string (`to_string`, `to_strings`), and copying data to -the host (`to_host`). - +the host (`to_host`). 
For example, to print a `cudf::column_view` contents or `column_wrapper` instance +to the console use the `cudf::test::print()`: +```cpp + cudf::test::fixed_width_column_wrapper input({1,2,3,4}); + auto splits = cudf::split(input,{2}); + cudf::test::print(input); + cudf::test::print(splits.front()); +``` +Fixed-width and strings columns output as comma-separated entries including null rows. +Nested columns are also supported and output includes the offsets and data children as well as +the null mask bits. ## Validating Stream Usage diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index 759a43b5627..a3fe699667a 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -1,7 +1,13 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. cmake_minimum_required(VERSION 3.26.4) +include(../set_cuda_architecture.cmake) + +# initialize cuda architecture +rapids_cuda_init_architectures(basic_example) +rapids_cuda_set_architectures(RAPIDS) + project( basic_example VERSION 0.0.1 @@ -14,3 +20,6 @@ include(../fetch_dependencies.cmake) add_executable(basic_example src/process_csv.cpp) target_link_libraries(basic_example PRIVATE cudf::cudf) target_compile_features(basic_example PRIVATE cxx_std_17) + +install(TARGETS basic_example DESTINATION bin/examples/libcudf) +install(FILES ${CMAKE_CURRENT_LIST_DIR}/4stock_5day.csv DESTINATION bin/examples/libcudf) diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index 001cdeec694..bde6ef7d69c 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -1,14 +1,41 @@ #!/bin/bash -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
# libcudf examples build script +set -euo pipefail + # Parallelism control PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} +# Installation disabled by default +INSTALL_EXAMPLES=false + +# Check for -i or --install flags to enable installation +ARGS=$(getopt -o i --long install -- "$@") +eval set -- "$ARGS" +while [ : ]; do + case "$1" in + -i | --install) + INSTALL_EXAMPLES=true + shift + ;; + --) shift; + break + ;; + esac +done # Root of examples EXAMPLES_DIR=$(dirname "$(realpath "$0")") + +# Set up default libcudf build directory and install prefix if conda build +if [ "${CONDA_BUILD:-"0"}" == "1" ]; then + LIB_BUILD_DIR="${LIB_BUILD_DIR:-${SRC_DIR/cpp/build}}" + INSTALL_PREFIX="${INSTALL_PREFIX:-${PREFIX}}" +fi + +# libcudf build directory LIB_BUILD_DIR=${LIB_BUILD_DIR:-$(readlink -f "${EXAMPLES_DIR}/../build")} ################################################################################ @@ -23,8 +50,13 @@ build_example() { cmake -S ${example_dir} -B ${build_dir} -Dcudf_ROOT="${LIB_BUILD_DIR}" # Build cmake --build ${build_dir} -j${PARALLEL_LEVEL} + # Install if needed + if [ "$INSTALL_EXAMPLES" = true ]; then + cmake --install ${build_dir} --prefix ${INSTALL_PREFIX:-${example_dir}/install} + fi } build_example basic build_example strings build_example nested_types +build_example parquet_io diff --git a/cpp/examples/fetch_dependencies.cmake b/cpp/examples/fetch_dependencies.cmake index a03f84ae142..851405caf55 100644 --- a/cpp/examples/fetch_dependencies.cmake +++ b/cpp/examples/fetch_dependencies.cmake @@ -11,7 +11,10 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. 
# ============================================================================= -set(CPM_DOWNLOAD_VERSION v0.35.3) + +include(${CMAKE_CURRENT_LIST_DIR}/versions.cmake) + +set(CPM_DOWNLOAD_VERSION v0.38.5) file( DOWNLOAD https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake @@ -19,9 +22,11 @@ file( ) include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) -set(CUDF_TAG branch-24.04) +# find or build it via CPM CPMFindPackage( - NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf + NAME cudf + FIND_PACKAGE_ARGUMENTS "PATHS ${cudf_ROOT} ${cudf_ROOT}/latest" GIT_REPOSITORY + https://github.com/rapidsai/cudf GIT_TAG ${CUDF_TAG} GIT_SHALLOW TRUE diff --git a/cpp/examples/nested_types/CMakeLists.txt b/cpp/examples/nested_types/CMakeLists.txt index cb9430db237..8a900f6b5ae 100644 --- a/cpp/examples/nested_types/CMakeLists.txt +++ b/cpp/examples/nested_types/CMakeLists.txt @@ -1,7 +1,13 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. cmake_minimum_required(VERSION 3.26.4) +include(../set_cuda_architecture.cmake) + +# initialize cuda architecture +rapids_cuda_init_architectures(nested_types) +rapids_cuda_set_architectures(RAPIDS) + project( nested_types VERSION 0.0.1 @@ -14,3 +20,6 @@ include(../fetch_dependencies.cmake) add_executable(deduplication deduplication.cpp) target_link_libraries(deduplication PRIVATE cudf::cudf) target_compile_features(deduplication PRIVATE cxx_std_17) + +install(TARGETS deduplication DESTINATION bin/examples/libcudf) +install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.json DESTINATION bin/examples/libcudf) diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt new file mode 100644 index 00000000000..d8e9205ffd4 --- /dev/null +++ b/cpp/examples/parquet_io/CMakeLists.txt @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +cmake_minimum_required(VERSION 3.26.4) + +include(../set_cuda_architecture.cmake) + +# initialize cuda architecture +rapids_cuda_init_architectures(parquet_io) +rapids_cuda_set_architectures(RAPIDS) + +project( + parquet_io + VERSION 0.0.1 + LANGUAGES CXX CUDA +) + +include(../fetch_dependencies.cmake) + +# Configure your project here +add_executable(parquet_io parquet_io.cpp) +target_link_libraries(parquet_io PRIVATE cudf::cudf) +target_compile_features(parquet_io PRIVATE cxx_std_17) + +install(TARGETS parquet_io DESTINATION bin/examples/libcudf) +install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet DESTINATION bin/examples/libcudf) diff --git a/cpp/examples/parquet_io/example.parquet b/cpp/examples/parquet_io/example.parquet new file mode 100644 index 00000000000..f0fb5319cb0 Binary files /dev/null and b/cpp/examples/parquet_io/example.parquet differ diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp new file mode 100644 index 00000000000..8be17db3781 --- /dev/null +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "parquet_io.hpp" + +/** + * @file parquet_io.cpp + * @brief Demonstrates usage of the libcudf APIs to read and write + * parquet file format with different encodings and compression types + * + * The following encoding and compression ztypes are demonstrated: + * Encoding Types: DEFAULT, DICTIONARY, PLAIN, DELTA_BINARY_PACKED, + * DELTA_LENGTH_BYTE_ARRAY, DELTA_BYTE_ARRAY + * + * Compression Types: NONE, AUTO, SNAPPY, LZ4, ZSTD + * + */ + +/** + * @brief Read parquet input from file + * + * @param filepath path to input parquet file + * @return cudf::io::table_with_metadata + */ +cudf::io::table_with_metadata read_parquet(std::string filepath) +{ + auto source_info = cudf::io::source_info(filepath); + auto builder = cudf::io::parquet_reader_options::builder(source_info); + auto options = builder.build(); + return cudf::io::read_parquet(options); +} + +/** + * @brief Write parquet output to file + * + * @param input table to write + * @param metadata metadata of input table read by parquet reader + * @param filepath path to output parquet file + * @param stats_level optional page size stats level + */ +void write_parquet(cudf::table_view input, + cudf::io::table_metadata metadata, + std::string filepath, + cudf::io::column_encoding encoding, + cudf::io::compression_type compression, + std::optional stats_level) +{ + // write the data for inspection + auto sink_info = cudf::io::sink_info(filepath); + auto builder = cudf::io::parquet_writer_options::builder(sink_info, input); + auto table_metadata = cudf::io::table_input_metadata{metadata}; + + std::for_each(table_metadata.column_metadata.begin(), + table_metadata.column_metadata.end(), + [=](auto& col_meta) { col_meta.set_encoding(encoding); }); + + builder.metadata(table_metadata); + auto options = builder.build(); + options.set_compression(compression); + // Either use the input stats level or don't write stats + 
options.set_stats_level(stats_level.value_or(cudf::io::statistics_freq::STATISTICS_NONE)); + + // write parquet data + cudf::io::write_parquet(options); +} + +/** + * @brief Main for nested_types examples + * + * Command line parameters: + * 1. parquet input file name/path (default: "example.parquet") + * 2. parquet output file name/path (default: "output.parquet") + * 3. encoding type for columns (default: "DELTA_BINARY_PACKED") + * 4. compression type (default: "ZSTD") + * 5. optional: use page size stats metadata (default: "NO") + * + * Example invocation from directory `cudf/cpp/examples/parquet_io`: + * ./build/parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD + * + */ +int main(int argc, char const** argv) +{ + std::string input_filepath; + std::string output_filepath; + cudf::io::column_encoding encoding; + cudf::io::compression_type compression; + std::optional page_stats; + + switch (argc) { + case 1: + input_filepath = "example.parquet"; + output_filepath = "output.parquet"; + encoding = get_encoding_type("DELTA_BINARY_PACKED"); + compression = get_compression_type("ZSTD"); + break; + case 6: page_stats = get_page_size_stats(argv[5]); [[fallthrough]]; + case 5: + input_filepath = argv[1]; + output_filepath = argv[2]; + encoding = get_encoding_type(argv[3]); + compression = get_compression_type(argv[4]); + break; + default: + throw std::runtime_error( + "Either provide all command-line arguments, or none to use defaults\n"); + } + + // Create and use a memory pool + bool is_pool_used = true; + auto resource = create_memory_resource(is_pool_used); + rmm::mr::set_current_device_resource(resource.get()); + + // Read input parquet file + // We do not want to time the initial read time as it may include + // time for nvcomp, cufile loading and RMM growth + std::cout << std::endl << "Reading " << input_filepath << "..." 
<< std::endl; + std::cout << "Note: Not timing the initial parquet read as it may include\n" + "times for nvcomp, cufile loading and RMM growth." + << std::endl + << std::endl; + auto [input, metadata] = read_parquet(input_filepath); + + // Status string to indicate if page stats are set to be written or not + auto page_stat_string = (page_stats.has_value()) ? "page stats" : "no page stats"; + // Write parquet file with the specified encoding and compression + std::cout << "Writing " << output_filepath << " with encoding, compression and " + << page_stat_string << ".." << std::endl; + + // `timer` is automatically started here + Timer timer; + write_parquet(input->view(), metadata, output_filepath, encoding, compression, page_stats); + timer.print_elapsed_millis(); + + // Read the parquet file written with encoding and compression + std::cout << "Reading " << output_filepath << "..." << std::endl; + + // Reset the timer + timer.reset(); + auto [transcoded_input, transcoded_metadata] = read_parquet(output_filepath); + timer.print_elapsed_millis(); + + // Check for validity + try { + // Left anti-join the original and transcoded tables + // identical tables should not throw an exception and + // return an empty indices vector + auto const indices = cudf::left_anti_join( + input->view(), transcoded_input->view(), cudf::null_equality::EQUAL, resource.get()); + + // No exception thrown, check indices + auto const valid = indices->size() == 0; + std::cout << "Transcoding valid: " << std::boolalpha << valid << std::endl; + } catch (std::exception& e) { + std::cerr << e.what() << std::endl << std::endl; + std::cout << "Transcoding valid: false" << std::endl; + } + + return 0; +} diff --git a/cpp/examples/parquet_io/parquet_io.hpp b/cpp/examples/parquet_io/parquet_io.hpp new file mode 100644 index 00000000000..d2fc359a2fe --- /dev/null +++ b/cpp/examples/parquet_io/parquet_io.hpp @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/** + * @brief Create memory resource for libcudf functions + * + * @param pool Whether to use a pool memory resource. + * @return Memory resource instance + */ +std::shared_ptr create_memory_resource(bool is_pool_used) +{ + auto cuda_mr = std::make_shared(); + if (is_pool_used) { + return rmm::mr::make_owning_wrapper( + cuda_mr, rmm::percent_of_free_device_memory(50)); + } + return cuda_mr; +} + +/** + * @brief Get encoding type from the keyword + * + * @param name encoding keyword name + * @return corresponding column encoding type + */ +[[nodiscard]] cudf::io::column_encoding get_encoding_type(std::string name) +{ + using encoding_type = cudf::io::column_encoding; + + static const std::unordered_map map = { + {"DEFAULT", encoding_type::USE_DEFAULT}, + {"DICTIONARY", encoding_type::DICTIONARY}, + {"PLAIN", encoding_type::PLAIN}, + {"DELTA_BINARY_PACKED", encoding_type::DELTA_BINARY_PACKED}, + {"DELTA_LENGTH_BYTE_ARRAY", encoding_type::DELTA_LENGTH_BYTE_ARRAY}, + {"DELTA_BYTE_ARRAY", encoding_type::DELTA_BYTE_ARRAY}, + }; + + std::transform(name.begin(), name.end(), name.begin(), ::toupper); + if (map.find(name) != map.end()) { return map.at(name); } + throw std::invalid_argument("FATAL: " + std::string(name) + + " is not a valid encoding type.\n\n" + 
"Available encoding types: DEFAULT, DICTIONARY, PLAIN,\n" + "DELTA_BINARY_PACKED, DELTA_LENGTH_BYTE_ARRAY,\n" + "DELTA_BYTE_ARRAY\n" + "\n" + "Exiting...\n"); +} + +/** + * @brief Get compression type from the keyword + * + * @param name compression keyword name + * @return corresponding compression type + */ +[[nodiscard]] cudf::io::compression_type get_compression_type(std::string name) +{ + using compression_type = cudf::io::compression_type; + + static const std::unordered_map map = { + {"NONE", compression_type::NONE}, + {"AUTO", compression_type::AUTO}, + {"SNAPPY", compression_type::SNAPPY}, + {"LZ4", compression_type::LZ4}, + {"ZSTD", compression_type::ZSTD}}; + + std::transform(name.begin(), name.end(), name.begin(), ::toupper); + if (map.find(name) != map.end()) { return map.at(name); } + throw std::invalid_argument("FATAL: " + std::string(name) + + " is not a valid compression type.\n\n" + "Available compression_type types: NONE, AUTO, SNAPPY,\n" + "LZ4, ZSTD\n" + "\n" + "Exiting...\n"); +} + +/** + * @brief Get the optional page size stat frequency from they keyword + * + * @param use_stats keyword affirmation string such as: Y, T, YES, TRUE, ON + * @return optional page statistics frequency set to full (STATISTICS_COLUMN) + */ +[[nodiscard]] std::optional get_page_size_stats(std::string use_stats) +{ + std::transform(use_stats.begin(), use_stats.end(), use_stats.begin(), ::toupper); + + // Check if the input string matches to any of the following + if (not use_stats.compare("ON") or not use_stats.compare("TRUE") or + not use_stats.compare("YES") or not use_stats.compare("Y") or not use_stats.compare("T")) { + // Full column and offset indices - STATISTICS_COLUMN + return std::make_optional(cudf::io::statistics_freq::STATISTICS_COLUMN); + } + + return std::nullopt; +} + +/** + * @brief Light-weight timer for parquet reader and writer instrumentation + * + * Timer object constructed from std::chrono, instrumenting at microseconds + * precision. 
Can display elapsed durations at milli and micro second + * scales. Timer starts at object construction. + */ +class Timer { + public: + using micros = std::chrono::microseconds; + using millis = std::chrono::milliseconds; + + Timer() { reset(); } + void reset() { start_time = std::chrono::high_resolution_clock::now(); } + auto elapsed() { return (std::chrono::high_resolution_clock::now() - start_time); } + void print_elapsed_micros() + { + std::cout << "Elapsed Time: " << std::chrono::duration_cast(elapsed()).count() + << "us\n\n"; + } + void print_elapsed_millis() + { + std::cout << "Elapsed Time: " << std::chrono::duration_cast(elapsed()).count() + << "ms\n\n"; + } + + private: + using time_point_t = std::chrono::time_point; + time_point_t start_time; +}; diff --git a/cpp/examples/set_cuda_architecture.cmake b/cpp/examples/set_cuda_architecture.cmake new file mode 100644 index 00000000000..bed6cd2f357 --- /dev/null +++ b/cpp/examples/set_cuda_architecture.cmake @@ -0,0 +1,28 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +include(${CMAKE_CURRENT_LIST_DIR}/versions.cmake) + +if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/libcudf_cpp_examples_RAPIDS.cmake) + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/${CUDF_TAG}/RAPIDS.cmake + ${CMAKE_CURRENT_BINARY_DIR}/libcudf_cpp_examples_RAPIDS.cmake + ) +endif() +include(${CMAKE_CURRENT_BINARY_DIR}/libcudf_cpp_examples_RAPIDS.cmake) + +include(rapids-cmake) +include(rapids-cpm) +include(rapids-cuda) +include(rapids-export) +include(rapids-find) diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt index c90fa9dde16..a5654870544 100644 --- a/cpp/examples/strings/CMakeLists.txt +++ b/cpp/examples/strings/CMakeLists.txt @@ -1,7 +1,13 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. cmake_minimum_required(VERSION 3.26.4) +include(../set_cuda_architecture.cmake) + +# initialize cuda architecture +rapids_cuda_init_architectures(strings_examples) +rapids_cuda_set_architectures(RAPIDS) + project( strings_examples VERSION 0.0.1 @@ -12,22 +18,27 @@ include(../fetch_dependencies.cmake) list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) -# add_executable(libcudf_apis libcudf_apis.cpp) target_compile_features(libcudf_apis PRIVATE cxx_std_17) target_link_libraries(libcudf_apis PRIVATE cudf::cudf nvToolsExt) +install(TARGETS libcudf_apis DESTINATION bin/examples/libcudf) add_executable(custom_with_malloc custom_with_malloc.cu) target_compile_features(custom_with_malloc PRIVATE cxx_std_17) target_compile_options(custom_with_malloc PRIVATE "$<$:${CUDF_CUDA_FLAGS}>") target_link_libraries(custom_with_malloc PRIVATE cudf::cudf nvToolsExt) +install(TARGETS custom_with_malloc DESTINATION bin/examples/libcudf) add_executable(custom_prealloc custom_prealloc.cu) target_compile_features(custom_prealloc PRIVATE cxx_std_17) target_compile_options(custom_prealloc 
PRIVATE "$<$:${CUDF_CUDA_FLAGS}>") target_link_libraries(custom_prealloc PRIVATE cudf::cudf nvToolsExt) +install(TARGETS custom_prealloc DESTINATION bin/examples/libcudf) add_executable(custom_optimized custom_optimized.cu) target_compile_features(custom_optimized PRIVATE cxx_std_17) target_compile_options(custom_optimized PRIVATE "$<$:${CUDF_CUDA_FLAGS}>") target_link_libraries(custom_optimized PRIVATE cudf::cudf nvToolsExt) +install(TARGETS custom_optimized DESTINATION bin/examples/libcudf) + +install(FILES ${CMAKE_CURRENT_LIST_DIR}/names.csv DESTINATION bin/examples/libcudf) diff --git a/cpp/examples/strings/common.hpp b/cpp/examples/strings/common.hpp index 0dbe6fe2b7b..65a9c100c7c 100644 --- a/cpp/examples/strings/common.hpp +++ b/cpp/examples/strings/common.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -110,7 +111,8 @@ int main(int argc, char const** argv) std::chrono::duration elapsed = std::chrono::steady_clock::now() - st; std::cout << "Wall time: " << elapsed.count() << " seconds\n"; - std::cout << "Output size " << result->view().child(1).size() << " bytes\n"; + auto const scv = cudf::strings_column_view(result->view()); + std::cout << "Output size " << scv.chars_size(rmm::cuda_stream_default) << " bytes\n"; return 0; } diff --git a/cpp/examples/strings/custom_optimized.cu b/cpp/examples/strings/custom_optimized.cu index cefa3346150..62ca19a5ca9 100644 --- a/cpp/examples/strings/custom_optimized.cu +++ b/cpp/examples/strings/custom_optimized.cu @@ -153,8 +153,12 @@ std::unique_ptr redact_strings(cudf::column_view const& names, redact_kernel<<>>( *d_names, *d_visibilities, offsets.data(), chars.data()); - // create column from offsets and chars vectors (no copy is performed) - auto result = cudf::make_strings_column(names.size(), std::move(offsets), chars.release(), {}, 0); + // create column from offsets vector (move only) + auto offsets_column = std::make_unique(std::move(offsets), rmm::device_buffer{}, 0); + + // 
create column for chars vector (no copy is performed) + auto result = cudf::make_strings_column( + names.size(), std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); // wait for all of the above to finish stream.synchronize(); diff --git a/cpp/examples/versions.cmake b/cpp/examples/versions.cmake new file mode 100644 index 00000000000..dff66b4d7d8 --- /dev/null +++ b/cpp/examples/versions.cmake @@ -0,0 +1,15 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(CUDF_TAG branch-24.06) diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp index a36a831a7aa..38f7ac5291f 100644 --- a/cpp/include/cudf/ast/detail/expression_parser.hpp +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,8 @@ #include #include +#include + #include #include @@ -118,7 +120,7 @@ class expression_parser { std::optional> right, bool has_nulls, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) : _left{left}, _right{right}, _expression_count{0}, @@ -139,7 +141,7 @@ class expression_parser { cudf::table_view const& table, bool has_nulls, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) : expression_parser(expr, table, {}, has_nulls, stream, mr) { } @@ -240,7 +242,7 @@ class expression_parser { data_pointers.push_back(v.data()); } - void move_to_device(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) + void move_to_device(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { std::vector sizes; std::vector data_pointers; diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index 9df4b4eb00f..5e41a871f32 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include #include +#include #include @@ -76,6 +77,8 @@ enum class binary_operator : int32_t { GREATER_EQUAL, ///< operator >= NULL_EQUALS, ///< Returns true when both operands are null; false when one is null; the ///< result of equality when both are non-null + NULL_NOT_EQUALS, ///< Returns false when both operands are null; true when one is null; the + ///< result of inequality when both are non-null NULL_MAX, ///< Returns max of operands when both are non-null; returns the non-null ///< operand when one is null; or invalid when both are null NULL_MIN, ///< Returns min of operands when both are non-null; returns the non-null @@ -116,8 +119,8 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a binary operation between a column and a scalar. @@ -147,8 +150,8 @@ std::unique_ptr binary_operation( scalar const& rhs, binary_operator op, data_type output_type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a binary operation between two columns. 
@@ -177,8 +180,8 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a binary operation between two columns using a @@ -208,8 +211,8 @@ std::unique_ptr binary_operation( column_view const& rhs, std::string const& ptx, data_type output_type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Computes the `scale` for a `fixed_point` number based on given binary operator `op` @@ -249,8 +252,8 @@ namespace binops { std::pair scalar_col_valid_mask_and( column_view const& col, scalar const& s, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); namespace compiled { namespace detail { diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp index 023e58c5300..22db25bdc83 100644 --- a/cpp/include/cudf/column/column.hpp +++ b/cpp/include/cudf/column/column.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -63,8 +64,8 @@ class column { * @param mr Device memory resource to use for all device memory allocations */ column(column const& other, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view 
stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Move the contents from `other` to create a new column. @@ -141,8 +142,8 @@ class column { * @param mr Device memory resource to use for all device memory allocations */ explicit column(column_view view, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the column's logical element type diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index 96322159f0f..dc4700576e6 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -23,6 +23,7 @@ #include #include +#include #include @@ -75,9 +76,9 @@ std::unique_ptr make_empty_column(type_id id); std::unique_ptr make_numeric_column( data_type type, size_type size, - mask_state state = mask_state::UNALLOCATED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + mask_state state = mask_state::UNALLOCATED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct column with sufficient uninitialized storage to hold `size` elements of the @@ -102,8 +103,8 @@ std::unique_ptr make_numeric_column( size_type size, B&& null_mask, size_type null_count, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_numeric(type), 
"Invalid, non-numeric type."); return std::make_unique(type, @@ -133,9 +134,9 @@ std::unique_ptr make_numeric_column( std::unique_ptr make_fixed_point_column( data_type type, size_type size, - mask_state state = mask_state::UNALLOCATED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + mask_state state = mask_state::UNALLOCATED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct column with sufficient uninitialized storage to hold `size` elements of the @@ -159,8 +160,8 @@ std::unique_ptr make_fixed_point_column( size_type size, B&& null_mask, size_type null_count, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type."); return std::make_unique(type, @@ -191,9 +192,9 @@ std::unique_ptr make_fixed_point_column( std::unique_ptr make_timestamp_column( data_type type, size_type size, - mask_state state = mask_state::UNALLOCATED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + mask_state state = mask_state::UNALLOCATED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct column with sufficient uninitialized storage to hold `size` elements of the @@ -218,8 +219,8 @@ std::unique_ptr make_timestamp_column( size_type size, B&& null_mask, size_type null_count, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource()) + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type."); return std::make_unique(type, @@ -250,9 +251,9 @@ std::unique_ptr make_timestamp_column( std::unique_ptr make_duration_column( data_type type, size_type size, - mask_state state = mask_state::UNALLOCATED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + mask_state state = mask_state::UNALLOCATED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct column with sufficient uninitialized storage to hold `size` elements of the @@ -277,8 +278,8 @@ std::unique_ptr make_duration_column( size_type size, B&& null_mask, size_type null_count, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type."); return std::make_unique(type, @@ -309,9 +310,9 @@ std::unique_ptr make_duration_column( std::unique_ptr make_fixed_width_column( data_type type, size_type size, - mask_state state = mask_state::UNALLOCATED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + mask_state state = mask_state::UNALLOCATED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct column with sufficient uninitialized storage to hold `size` elements of the @@ -336,8 +337,8 @@ 
std::unique_ptr make_fixed_width_column( size_type size, B&& null_mask, size_type null_count, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_fixed_width(type), "Invalid, non-fixed-width type."); if (is_timestamp(type)) { @@ -375,8 +376,8 @@ std::unique_ptr make_fixed_width_column( */ std::unique_ptr make_strings_column( cudf::device_span const> strings, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a STRING type column given a device span of string_view. @@ -407,8 +408,8 @@ std::unique_ptr make_strings_column( std::unique_ptr make_strings_column( cudf::device_span string_views, string_view const null_placeholder, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a STRING type column given offsets column, chars columns, and null mask and null @@ -495,8 +496,8 @@ std::unique_ptr make_lists_column( std::unique_ptr child_column, size_type null_count, rmm::device_buffer&& null_mask, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a STRUCT column using specified child 
columns as members. @@ -526,8 +527,8 @@ std::unique_ptr make_structs_column( std::vector>&& child_columns, size_type null_count, rmm::device_buffer&& null_mask, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a column with size elements that are all equal to the given scalar. @@ -546,8 +547,8 @@ std::unique_ptr make_structs_column( std::unique_ptr make_column_from_scalar( scalar const& s, size_type size, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a dictionary column with size elements that are all equal to the given scalar. @@ -566,8 +567,8 @@ std::unique_ptr make_column_from_scalar( std::unique_ptr make_dictionary_from_scalar( scalar const& s, size_type size, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/concatenate.hpp b/cpp/include/cudf/concatenate.hpp index 9ee55275a5e..e7b55a2e6d0 100644 --- a/cpp/include/cudf/concatenate.hpp +++ b/cpp/include/cudf/concatenate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ #include #include +#include #include @@ -46,8 +47,8 @@ namespace cudf { */ rmm::device_buffer concatenate_masks( host_span views, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Concatenates multiple columns into a single column @@ -63,8 +64,8 @@ rmm::device_buffer concatenate_masks( */ std::unique_ptr concatenate( host_span columns_to_concat, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Columns of `tables_to_concat` are concatenated vertically to return a @@ -92,8 +93,8 @@ std::unique_ptr concatenate( */ std::unique_ptr
concatenate( host_span tables_to_concat, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp index 1bbbf73bd5d..0d4f20d1ef2 100644 --- a/cpp/include/cudf/contiguous_split.hpp +++ b/cpp/include/cudf/contiguous_split.hpp @@ -19,6 +19,8 @@ #include #include +#include + #include #include @@ -119,7 +121,7 @@ struct packed_table { std::vector contiguous_split( cudf::table_view const& input, std::vector const& splits, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); namespace detail { struct contiguous_split_state; @@ -196,7 +198,7 @@ class chunked_pack { explicit chunked_pack( cudf::table_view const& input, std::size_t user_buffer_size, - rmm::mr::device_memory_resource* temp_mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref temp_mr = rmm::mr::get_current_device_resource()); /** * @brief Destructor that will be implemented as default. 
Declared with definition here because @@ -261,7 +263,7 @@ class chunked_pack { [[nodiscard]] static std::unique_ptr create( cudf::table_view const& input, std::size_t user_buffer_size, - rmm::mr::device_memory_resource* temp_mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref temp_mr = rmm::mr::get_current_device_resource()); private: // internal state of contiguous split @@ -281,7 +283,7 @@ class chunked_pack { * and device memory respectively */ packed_columns pack(cudf::table_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Produce the metadata used for packing a table stored in a contiguous buffer. diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index b2cde82fada..b17cafb05ab 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -25,6 +25,7 @@ #include #include +#include #include #include @@ -84,9 +85,9 @@ enum class out_of_bounds_policy : bool { std::unique_ptr
gather( table_view const& source_table, column_view const& gather_map, - out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Reverses the rows within a table. @@ -105,8 +106,8 @@ std::unique_ptr
gather( */ std::unique_ptr
reverse( table_view const& source_table, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Reverses the elements of a column @@ -125,8 +126,8 @@ std::unique_ptr
reverse( */ std::unique_ptr reverse( column_view const& source_column, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Scatters the rows of the source table into a copy of the target table @@ -174,8 +175,8 @@ std::unique_ptr
scatter( table_view const& source, column_view const& scatter_map, table_view const& target, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Scatters a row of scalar values into a copy of the target table @@ -217,8 +218,8 @@ std::unique_ptr
scatter( std::vector> const& source, column_view const& indices, table_view const& target, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Indicates when to allocate a mask, based on an existing mask. @@ -253,6 +254,8 @@ std::unique_ptr empty_like(scalar const& input); * If the `mask_alloc` allocates a validity mask that mask is also uninitialized * and the validity bits and the null count should be set by the caller. * + * @throws cudf::data_type_error if input type is not of fixed width. + * * @param input Immutable view of input column to emulate * @param mask_alloc Optional, Policy for allocating null mask. Defaults to RETAIN * @param mr Device memory resource used to allocate the returned column's device memory @@ -262,9 +265,9 @@ std::unique_ptr empty_like(scalar const& input); */ std::unique_ptr allocate_like( column_view const& input, - mask_allocation_policy mask_alloc = mask_allocation_policy::RETAIN, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + mask_allocation_policy mask_alloc = mask_allocation_policy::RETAIN, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Creates an uninitialized new column of the specified size and same type as the `input`. 
@@ -285,9 +288,9 @@ std::unique_ptr allocate_like( std::unique_ptr allocate_like( column_view const& input, size_type size, - mask_allocation_policy mask_alloc = mask_allocation_policy::RETAIN, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + mask_allocation_policy mask_alloc = mask_allocation_policy::RETAIN, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Creates a table of empty columns with the same types as the `input_table` @@ -360,6 +363,7 @@ void copy_range_in_place(column_view const& source, * * @throws std::out_of_range for any invalid range. * @throws cudf::data_type_error if @p target and @p source have different types. + * @throws cudf::data_type_error if the data type is not fixed width, string, or dictionary * * @param source The column to copy from inside the range * @param target The column to copy from outside the range @@ -377,8 +381,8 @@ std::unique_ptr copy_range( size_type source_begin, size_type source_end, size_type target_begin, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Creates a new column by shifting all values by an offset. 
@@ -421,8 +425,8 @@ std::unique_ptr shift( column_view const& input, size_type offset, scalar const& fill_value, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Slices a `column_view` into a set of `column_view`s according to a set of indices. @@ -624,8 +628,8 @@ std::unique_ptr copy_if_else( column_view const& lhs, column_view const& rhs, column_view const& boolean_mask, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a new column, where each element is selected from either @p lhs or @@ -650,8 +654,8 @@ std::unique_ptr copy_if_else( scalar const& lhs, column_view const& rhs, column_view const& boolean_mask, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a new column, where each element is selected from either @p lhs or @@ -676,8 +680,8 @@ std::unique_ptr copy_if_else( column_view const& lhs, scalar const& rhs, column_view const& boolean_mask, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a new column, where each element is selected from either @p lhs or @@ -700,8 
+704,8 @@ std::unique_ptr copy_if_else( scalar const& lhs, scalar const& rhs, column_view const& boolean_mask, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Scatters rows from the input table to rows of the output corresponding @@ -744,8 +748,8 @@ std::unique_ptr
boolean_mask_scatter( table_view const& input, table_view const& target, column_view const& boolean_mask, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Scatters scalar values to rows of the output corresponding @@ -783,8 +787,8 @@ std::unique_ptr
boolean_mask_scatter( std::vector> const& input, table_view const& target, column_view const& boolean_mask, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Get the element at specified index from a column @@ -803,8 +807,8 @@ std::unique_ptr
boolean_mask_scatter( std::unique_ptr get_element( column_view const& input, size_type index, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Indicates whether a row can be sampled more than once. @@ -848,7 +852,7 @@ std::unique_ptr
sample( sample_with_replacement replacement = sample_with_replacement::FALSE, int64_t const seed = 0, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Checks if a column or its descendants have non-empty null rows @@ -964,8 +968,8 @@ bool may_have_nonempty_nulls(column_view const& input); */ std::unique_ptr purge_nonempty_nulls( column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ } // namespace cudf diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 44736ca0762..06b7d24f6cd 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,6 +19,7 @@ #include #include +#include #include @@ -47,7 +48,7 @@ namespace datetime { */ std::unique_ptr extract_year( cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Extracts month from any datetime type and returns an int16_t @@ -61,7 +62,7 @@ std::unique_ptr extract_year( */ std::unique_ptr extract_month( cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Extracts day from any datetime type and returns an int16_t @@ -75,7 +76,7 @@ std::unique_ptr extract_month( */ std::unique_ptr extract_day( cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Extracts a weekday from any datetime type and returns an int16_t @@ -89,7 +90,7 @@ std::unique_ptr extract_day( */ std::unique_ptr extract_weekday( cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Extracts hour from any datetime type and returns an int16_t @@ -103,7 +104,7 @@ std::unique_ptr extract_weekday( */ std::unique_ptr extract_hour( cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Extracts minute from any datetime type and returns an int16_t @@ -117,7 +118,7 @@ std::unique_ptr extract_hour( */ std::unique_ptr extract_minute( cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + 
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Extracts second from any datetime type and returns an int16_t @@ -131,7 +132,7 @@ std::unique_ptr extract_minute( */ std::unique_ptr extract_second( cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Extracts millisecond fraction from any datetime type and returns an int16_t @@ -148,7 +149,7 @@ std::unique_ptr extract_second( */ std::unique_ptr extract_millisecond_fraction( cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Extracts microsecond fraction from any datetime type and returns an int16_t @@ -165,7 +166,7 @@ std::unique_ptr extract_millisecond_fraction( */ std::unique_ptr extract_microsecond_fraction( cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Extracts nanosecond fraction from any datetime type and returns an int16_t @@ -182,7 +183,7 @@ std::unique_ptr extract_microsecond_fraction( */ std::unique_ptr extract_nanosecond_fraction( cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group /** @@ -203,7 +204,7 @@ std::unique_ptr extract_nanosecond_fraction( */ std::unique_ptr last_day_of_month( cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Computes the day number since the start of the year 
from the datetime and @@ -217,7 +218,7 @@ std::unique_ptr last_day_of_month( */ std::unique_ptr day_of_year( cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Adds or subtracts a number of months from the datetime type and returns a @@ -252,7 +253,7 @@ std::unique_ptr day_of_year( std::unique_ptr add_calendrical_months( cudf::column_view const& timestamps, cudf::column_view const& months, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Adds or subtracts a number of months from the datetime type and returns a @@ -287,7 +288,7 @@ std::unique_ptr add_calendrical_months( std::unique_ptr add_calendrical_months( cudf::column_view const& timestamps, cudf::scalar const& months, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Check if the year of the given date is a leap year @@ -304,7 +305,7 @@ std::unique_ptr add_calendrical_months( */ std::unique_ptr is_leap_year( cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Extract the number of days in the month @@ -320,7 +321,7 @@ std::unique_ptr is_leap_year( */ std::unique_ptr days_in_month( cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the quarter of the date @@ -336,7 +337,7 @@ std::unique_ptr days_in_month( */ std::unique_ptr extract_quarter( cudf::column_view const& column, - rmm::mr::device_memory_resource* 
mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Fixed frequencies supported by datetime rounding functions ceil, floor, round. @@ -365,7 +366,7 @@ enum class rounding_frequency : int32_t { std::unique_ptr ceil_datetimes( cudf::column_view const& column, rounding_frequency freq, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Round datetimes down to the nearest multiple of the given frequency. @@ -380,7 +381,7 @@ std::unique_ptr ceil_datetimes( std::unique_ptr floor_datetimes( cudf::column_view const& column, rounding_frequency freq, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Round datetimes to the nearest multiple of the given frequency. @@ -395,7 +396,7 @@ std::unique_ptr floor_datetimes( std::unique_ptr round_datetimes( cudf::column_view const& column, rounding_frequency freq, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/detail/binaryop.hpp b/cpp/include/cudf/detail/binaryop.hpp index e5609568d10..de1fde8bc96 100644 --- a/cpp/include/cudf/detail/binaryop.hpp +++ b/cpp/include/cudf/detail/binaryop.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include namespace cudf { //! 
Inner interfaces and implementations @@ -26,7 +27,7 @@ namespace detail { /** * @copydoc cudf::binary_operation(column_view const&, column_view const&, - * std::string const&, data_type, rmm::mr::device_memory_resource *) + * std::string const&, data_type, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -35,11 +36,11 @@ std::unique_ptr binary_operation(column_view const& lhs, std::string const& ptx, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::binary_operation(scalar const&, column_view const&, binary_operator, - * data_type, rmm::mr::device_memory_resource *) + * data_type, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -48,11 +49,11 @@ std::unique_ptr binary_operation(scalar const& lhs, binary_operator op, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::binary_operation(column_view const&, scalar const&, binary_operator, - * data_type, rmm::mr::device_memory_resource *) + * data_type, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -61,11 +62,11 @@ std::unique_ptr binary_operation(column_view const& lhs, binary_operator op, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::binary_operation(column_view const&, column_view const&, - * binary_operator, data_type, rmm::mr::device_memory_resource *) + * binary_operator, data_type, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ @@ -74,6 +75,6 @@ std::unique_ptr binary_operation(column_view const& lhs, binary_operator op, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/calendrical_month_sequence.cuh b/cpp/include/cudf/detail/calendrical_month_sequence.cuh index 59fb6758973..a9cf54e29b8 100644 --- a/cpp/include/cudf/detail/calendrical_month_sequence.cuh +++ b/cpp/include/cudf/detail/calendrical_month_sequence.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ #include #include +#include #include #include @@ -38,7 +39,7 @@ struct calendrical_month_sequence_functor { scalar const& input, size_type months, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // Return empty column if n = 0 if (n == 0) return cudf::make_empty_column(input.type()); diff --git a/cpp/include/cudf/detail/concatenate.hpp b/cpp/include/cudf/detail/concatenate.hpp index 442814bc4fd..3e039175542 100644 --- a/cpp/include/cudf/detail/concatenate.hpp +++ b/cpp/include/cudf/detail/concatenate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include +#include #include @@ -29,22 +30,22 @@ namespace cudf { //! 
Inner interfaces and implementations namespace detail { /** - * @copydoc cudf::concatenate(host_span,rmm::mr::device_memory_resource*) + * @copydoc cudf::concatenate(host_span,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr concatenate(host_span columns_to_concat, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** - * @copydoc cudf::concatenate(host_span,rmm::mr::device_memory_resource*) + * @copydoc cudf::concatenate(host_span,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr
concatenate(host_span tables_to_concat, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/concatenate_masks.hpp b/cpp/include/cudf/detail/concatenate_masks.hpp index e7086ea17a5..dd2fb471a7d 100644 --- a/cpp/include/cudf/detail/concatenate_masks.hpp +++ b/cpp/include/cudf/detail/concatenate_masks.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include #include +#include namespace cudf { //! Inner interfaces and implementations @@ -59,13 +60,13 @@ size_type concatenate_masks(host_span views, rmm::cuda_stream_view stream); /** - * @copydoc cudf::concatenate_masks(host_span, rmm::mr::device_memory_resource*) + * @copydoc cudf::concatenate_masks(host_span, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ rmm::device_buffer concatenate_masks(host_span views, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/contiguous_split.hpp b/cpp/include/cudf/detail/contiguous_split.hpp index d9a35470b7d..de00b61cdca 100644 --- a/cpp/include/cudf/detail/contiguous_split.hpp +++ b/cpp/include/cudf/detail/contiguous_split.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -33,7 +34,7 @@ namespace detail { std::vector contiguous_split(cudf::table_view const& input, std::vector const& splits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::pack @@ -42,7 +43,7 @@ std::vector contiguous_split(cudf::table_view const& input, **/ packed_columns pack(cudf::table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); // opaque implementation of `metadata_builder` since it needs to use // `serialized_column`, which is only defined in pack.cpp diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index 115822163c3..f7430eb090d 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,6 +24,7 @@ #include #include +#include #include @@ -123,7 +124,7 @@ std::vector split(table_view const& input, /** * @copydoc cudf::shift(column_view const&,size_type,scalar const&, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -131,7 +132,7 @@ std::unique_ptr shift(column_view const& input, size_type offset, scalar const& fill_value, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Performs segmented shifts for specified values. 
@@ -171,11 +172,11 @@ std::unique_ptr segmented_shift(column_view const& segmented_values, size_type offset, scalar const& fill_value, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::allocate_like(column_view const&, size_type, mask_allocation_policy, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ @@ -183,11 +184,11 @@ std::unique_ptr allocate_like(column_view const& input, size_type size, mask_allocation_policy mask_alloc, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::copy_if_else( column_view const&, column_view const&, - * column_view const&, rmm::mr::device_memory_resource*) + * column_view const&, rmm::device_async_resource_ref) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ @@ -195,11 +196,11 @@ std::unique_ptr copy_if_else(column_view const& lhs, column_view const& rhs, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::copy_if_else( scalar const&, column_view const&, - * column_view const&, rmm::mr::device_memory_resource*) + * column_view const&, rmm::device_async_resource_ref) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ @@ -207,11 +208,11 @@ std::unique_ptr copy_if_else(scalar const& lhs, column_view const& rhs, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::copy_if_else( column_view const&, scalar const&, - * column_view const&, rmm::mr::device_memory_resource*) + * column_view const&, rmm::device_async_resource_ref) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ @@ -219,11 +220,11 @@ std::unique_ptr copy_if_else(column_view const& lhs, scalar const& rhs, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::copy_if_else( scalar const&, scalar const&, - * column_view const&, rmm::mr::device_memory_resource*) + * column_view const&, rmm::device_async_resource_ref) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ @@ -231,7 +232,7 @@ std::unique_ptr copy_if_else(scalar const& lhs, scalar const& rhs, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::sample @@ -243,7 +244,7 @@ std::unique_ptr
sample(table_view const& input, sample_with_replacement replacement, int64_t const seed, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::get_element @@ -253,7 +254,7 @@ std::unique_ptr
sample(table_view const& input, std::unique_ptr get_element(column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::has_nonempty_nulls @@ -276,7 +277,7 @@ bool may_have_nonempty_nulls(column_view const& input, rmm::cuda_stream_view str */ std::unique_ptr purge_nonempty_nulls(column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index 3af050a5da6..c98057d077a 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -239,7 +240,7 @@ struct scatter_gather_functor { Filter filter, cudf::size_type per_thread, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto output_column = cudf::detail::allocate_like( input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr); @@ -286,7 +287,7 @@ struct scatter_gather_functor { Filter filter, cudf::size_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { rmm::device_uvector indices(output_size, stream); @@ -325,7 +326,7 @@ template std::unique_ptr
copy_if(table_view const& input, Filter filter, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 6162fa5ecf1..8418e279ce7 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -23,6 +23,7 @@ #include #include +#include #include #include @@ -44,29 +45,30 @@ __launch_bounds__(block_size) CUDF_KERNEL mutable_column_device_view out, size_type* __restrict__ const valid_count) { - size_type const tid = threadIdx.x + blockIdx.x * block_size; - int const warp_id = tid / warp_size; - size_type const warps_per_grid = gridDim.x * block_size / warp_size; + auto tidx = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); + int const warp_id = tidx / cudf::detail::warp_size; + size_type const warps_per_grid = gridDim.x * block_size / cudf::detail::warp_size; // begin/end indices for the column data - size_type begin = 0; - size_type end = out.size(); + size_type const begin = 0; + size_type const end = out.size(); // warp indices. since 1 warp == 32 threads == sizeof(bitmask_type) * 8, // each warp will process one (32 bit) of the validity mask via // __ballot_sync() - size_type warp_begin = cudf::word_index(begin); - size_type warp_end = cudf::word_index(end - 1); + size_type const warp_begin = cudf::word_index(begin); + size_type const warp_end = cudf::word_index(end - 1); // lane id within the current warp constexpr size_type leader_lane{0}; - int const lane_id = threadIdx.x % warp_size; + int const lane_id = threadIdx.x % cudf::detail::warp_size; size_type warp_valid_count{0}; // current warp. size_type warp_cur = warp_begin + warp_id; - size_type index = tid; while (warp_cur <= warp_end) { + auto const index = static_cast(tidx); auto const opt_value = (index < end) ? (filter(index) ? 
lhs[index] : rhs[index]) : thrust::nullopt; if (opt_value) { out.element(index) = static_cast(*opt_value); } @@ -84,7 +86,7 @@ __launch_bounds__(block_size) CUDF_KERNEL // next grid warp_cur += warps_per_grid; - index += block_size * gridDim.x; + tidx += stride; } if (has_nulls) { @@ -152,13 +154,13 @@ std::unique_ptr copy_if_else(bool nullable, FilterFn filter, cudf::data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // This is the type of the thrust::optional element in the passed iterators using Element = typename thrust::iterator_traits::value_type::value_type; size_type size = std::distance(lhs_begin, lhs_end); - size_type num_els = cudf::util::round_up_safe(size, warp_size); + size_type num_els = cudf::util::round_up_safe(size, cudf::detail::warp_size); constexpr int block_size = 256; cudf::detail::grid_1d grid{num_els, block_size, 1}; diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index 9f8b0f8b619..1b3b2056c6c 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -203,7 +204,7 @@ std::unique_ptr copy_range(column_view const& source, size_type source_end, size_type target_begin, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index c5160958165..a93c06d4371 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,156 +19,158 @@ #include #include +#include + #include namespace cudf { namespace datetime { namespace detail { /** - * @copydoc cudf::extract_year(cudf::column_view const&, rmm::mr::device_memory_resource *) + * @copydoc cudf::extract_year(cudf::column_view const&, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_year(cudf::column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_month(cudf::column_view const&, rmm::mr::device_memory_resource *) + * @copydoc cudf::extract_month(cudf::column_view const&, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_month(cudf::column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_day(cudf::column_view const&, rmm::mr::device_memory_resource *) + * @copydoc cudf::extract_day(cudf::column_view const&, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_day(cudf::column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_weekday(cudf::column_view const&, rmm::mr::device_memory_resource *) + * @copydoc cudf::extract_weekday(cudf::column_view const&, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr extract_weekday(cudf::column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_hour(cudf::column_view const&, rmm::mr::device_memory_resource *) + * @copydoc cudf::extract_hour(cudf::column_view const&, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_hour(cudf::column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_minute(cudf::column_view const&, rmm::mr::device_memory_resource *) + * @copydoc cudf::extract_minute(cudf::column_view const&, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_minute(cudf::column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_second(cudf::column_view const&, rmm::mr::device_memory_resource *) + * @copydoc cudf::extract_second(cudf::column_view const&, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_second(cudf::column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::extract_millisecond_fraction(cudf::column_view const&, - * rmm::mr::device_memory_resource *) + * rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr extract_millisecond_fraction(cudf::column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::extract_microsecond_fraction(cudf::column_view const&, - * rmm::mr::device_memory_resource *) + * rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_microsecond_fraction(cudf::column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::extract_nanosecond_fraction(cudf::column_view const&, - * rmm::mr::device_memory_resource *) + * rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_nanosecond_fraction(cudf::column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** - * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::mr::device_memory_resource *) + * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr last_day_of_month(cudf::column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** - * @copydoc cudf::day_of_year(cudf::column_view const&, rmm::mr::device_memory_resource *) + * @copydoc cudf::day_of_year(cudf::column_view const&, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr day_of_year(cudf::column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::add_calendrical_months(cudf::column_view const&, cudf::column_view const&, - * rmm::mr::device_memory_resource *) + * rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr add_calendrical_months(cudf::column_view const& timestamps, cudf::column_view const& months, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::add_calendrical_months(cudf::column_view const&, cudf::scalar const&, - * rmm::mr::device_memory_resource *) + * rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr add_calendrical_months(cudf::column_view const& timestamps, cudf::scalar const& months, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** - * @copydoc cudf::is_leap_year(cudf::column_view const&, rmm::mr::device_memory_resource *) + * @copydoc cudf::is_leap_year(cudf::column_view const&, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr is_leap_year(cudf::column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); std::unique_ptr extract_quarter(cudf::column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace datetime diff --git a/cpp/include/cudf/detail/distinct_hash_join.cuh b/cpp/include/cudf/detail/distinct_hash_join.cuh index e874151ed36..de3d23e9470 100644 --- a/cpp/include/cudf/detail/distinct_hash_join.cuh +++ b/cpp/include/cudf/detail/distinct_hash_join.cuh @@ -21,6 +21,7 @@ #include #include +#include #include @@ -84,16 +85,10 @@ struct hasher_adapter { template struct distinct_hash_join { private: - /// Row equality type for nested columns - using nested_row_equal = cudf::experimental::row::equality::strong_index_comparator_adapter< - cudf::experimental::row::equality::device_row_comparator>; - /// Row equality type for flat columns - using flat_row_equal = cudf::experimental::row::equality::strong_index_comparator_adapter< - cudf::experimental::row::equality::device_row_comparator>; - /// Device row equal type - using d_equal_type = - std::conditional_t; + using d_equal_type = cudf::experimental::row::equality::strong_index_comparator_adapter< + cudf::experimental::row::equality::device_row_comparator>; using hasher = hasher_adapter>; using probing_scheme_type = cuco::linear_probing<1, hasher>; using cuco_storage_type = cuco::storage<1>; @@ -148,12 +143,12 @@ struct distinct_hash_join { */ std::pair>, std::unique_ptr>> - inner_join(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; + inner_join(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; /** * @copydoc cudf::distinct_hash_join::left_join */ std::unique_ptr> left_join( - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; + rmm::cuda_stream_view stream, 
rmm::device_async_resource_ref mr) const; }; } // namespace cudf::detail diff --git a/cpp/include/cudf/detail/fill.hpp b/cpp/include/cudf/detail/fill.hpp index caaccfb4851..6996cda6974 100644 --- a/cpp/include/cudf/detail/fill.hpp +++ b/cpp/include/cudf/detail/fill.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include #include @@ -48,7 +49,7 @@ std::unique_ptr fill(column_view const& input, size_type end, scalar const& value, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 6492aa23e80..c9d350ce983 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -38,6 +38,7 @@ #include #include +#include #include #include @@ -174,7 +175,7 @@ struct column_gatherer { MapIterator gather_map_end, bool nullify_out_of_bounds, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { column_gatherer_impl gatherer{}; @@ -214,7 +215,7 @@ struct column_gatherer_impl { MapItType gather_map_end, bool nullify_out_of_bounds, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (true == nullify_out_of_bounds) { return cudf::strings::detail::gather( @@ -334,7 +335,7 @@ struct column_gatherer_impl { MapItRoot gather_map_end, bool nullify_out_of_bounds, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { lists_column_view list(column); auto gather_map_size = std::distance(gather_map_begin, gather_map_end); @@ -397,7 +398,7 @@ struct column_gatherer_impl { 
MapItType gather_map_end, bool nullify_out_of_bounds, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { dictionary_column_view dictionary(source_column); auto output_count = std::distance(gather_map_begin, gather_map_end); @@ -448,7 +449,7 @@ struct column_gatherer_impl { MapItRoot gather_map_end, bool nullify_out_of_bounds, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const gather_map_size = std::distance(gather_map_begin, gather_map_end); if (gather_map_size == 0) { return empty_like(column); } @@ -554,7 +555,7 @@ void gather_bitmask(table_view const& source, std::vector>& target, gather_bitmask_op op, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (target.empty()) { return; } @@ -652,7 +653,7 @@ std::unique_ptr
gather(table_view const& source_table, MapIterator gather_map_end, out_of_bounds_policy bounds_policy, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { std::vector> destination_columns; diff --git a/cpp/include/cudf/detail/gather.hpp b/cpp/include/cudf/detail/gather.hpp index 034eb6c1282..36824f56895 100644 --- a/cpp/include/cudf/detail/gather.hpp +++ b/cpp/include/cudf/detail/gather.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include #include +#include #include @@ -66,12 +67,12 @@ std::unique_ptr
gather(table_view const& source_table, out_of_bounds_policy bounds_policy, negative_index_policy neg_indices, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::detail::gather(table_view const&,column_view const&,table_view * const&,cudf::out_of_bounds_policy,cudf::detail::negative_index_policy,rmm::cuda_stream_view, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * * @throws cudf::logic_error if `gather_map` span size is larger than max of `size_type`. */ @@ -80,7 +81,7 @@ std::unique_ptr
gather(table_view const& source_table, out_of_bounds_policy bounds_policy, negative_index_policy neg_indices, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/groupby.hpp b/cpp/include/cudf/detail/groupby.hpp index 0afa69be1a3..5a8c9b0a27f 100644 --- a/cpp/include/cudf/detail/groupby.hpp +++ b/cpp/include/cudf/detail/groupby.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -45,7 +46,7 @@ std::pair, std::vector> groupby( host_span requests, null_policy include_null_keys, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace hash } // namespace detail diff --git a/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp index e081a626c75..389c7952875 100644 --- a/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp +++ b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,6 +23,7 @@ #include #include +#include namespace cudf { namespace groupby { namespace detail { @@ -40,7 +41,7 @@ std::unique_ptr group_replace_nulls(cudf::column_view const& grouped_val device_span group_labels, cudf::replace_policy replace_policy, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace groupby diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp index 7b386eb5f03..567efedb9b2 100644 --- a/cpp/include/cudf/detail/groupby/sort_helper.hpp +++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include #include +#include namespace cudf { namespace groupby { @@ -87,7 +88,7 @@ struct sort_groupby_helper { */ std::unique_ptr sorted_values(column_view const& values, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Groups a column of values according to `keys` @@ -101,7 +102,7 @@ struct sort_groupby_helper { */ std::unique_ptr grouped_values(column_view const& values, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Get a table of sorted unique keys @@ -109,7 +110,7 @@ struct sort_groupby_helper { * @return a new table in which each row is a unique row in the sorted key table. */ std::unique_ptr
unique_keys(rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Get a table of sorted keys @@ -117,7 +118,7 @@ struct sort_groupby_helper { * @return a new table containing the sorted keys. */ std::unique_ptr
sorted_keys(rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Get the number of groups in `keys` diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh index 1df6848c575..dfe79646167 100644 --- a/cpp/include/cudf/detail/hash_reduce_by_row.cuh +++ b/cpp/include/cudf/detail/hash_reduce_by_row.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -124,7 +125,7 @@ rmm::device_uvector hash_reduce_by_row( ReduceFuncBuilder func_builder, OutputType init, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const map_dview = map.get_device_view(); auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp index 296b68d22a9..5b2b9b5e69d 100644 --- a/cpp/include/cudf/detail/interop.hpp +++ b/cpp/include/cudf/detail/interop.hpp @@ -23,6 +23,8 @@ #pragma nv_diag_suppress 611 #pragma nv_diag_suppress 2810 #endif +#include + #include #ifdef __CUDACC__ #pragma nv_diag_default 611 @@ -47,7 +49,7 @@ namespace detail { */ std::unique_ptr
from_dlpack(DLManagedTensor const* managed_tensor, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::to_dlpack @@ -56,7 +58,7 @@ std::unique_ptr
from_dlpack(DLManagedTensor const* managed_tensor, */ DLManagedTensor* to_dlpack(table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); // Creating arrow as per given type_id and buffer arguments template @@ -127,19 +129,19 @@ std::shared_ptr to_arrow(cudf::scalar const& input, arrow::MemoryPool* ar_mr); /** * @copydoc cudf::from_arrow(arrow::Table const& input_table, rmm::cuda_stream_view stream, - * rmm::mr::device_memory_resource* mr) + * rmm::device_async_resource_ref mr) */ std::unique_ptr
from_arrow(arrow::Table const& input_table, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::from_arrow(arrow::Scalar const& input, rmm::cuda_stream_view stream, - * rmm::mr::device_memory_resource* mr) + * rmm::device_async_resource_ref mr) */ std::unique_ptr from_arrow(arrow::Scalar const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Return a maximum precision for a given type. diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp index 27d14874bce..aabfff746ea 100644 --- a/cpp/include/cudf/detail/join.hpp +++ b/cpp/include/cudf/detail/join.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #include @@ -105,7 +106,7 @@ struct hash_join { inner_join(cudf::table_view const& probe, std::optional output_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; + rmm::device_async_resource_ref mr) const; /** * @copydoc cudf::hash_join::left_join @@ -115,7 +116,7 @@ struct hash_join { left_join(cudf::table_view const& probe, std::optional output_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; + rmm::device_async_resource_ref mr) const; /** * @copydoc cudf::hash_join::full_join @@ -125,7 +126,7 @@ struct hash_join { full_join(cudf::table_view const& probe, std::optional output_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; + rmm::device_async_resource_ref mr) const; /** * @copydoc cudf::hash_join::inner_join_size @@ -144,7 +145,7 @@ struct hash_join { */ std::size_t full_join_size(cudf::table_view const& probe, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; + rmm::device_async_resource_ref mr) const; private: /** @@ -169,7 +170,7 @@ struct hash_join { join_kind join, std::optional output_size, rmm::cuda_stream_view stream, - 
rmm::mr::device_memory_resource* mr) const; + rmm::device_async_resource_ref mr) const; /** * @copydoc cudf::detail::hash_join::probe_join_indices @@ -184,7 +185,7 @@ struct hash_join { join_kind join, std::optional output_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; + rmm::device_async_resource_ref mr) const; }; } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/label_bins.hpp b/cpp/include/cudf/detail/label_bins.hpp index 50eeba58cdd..9f6dcce448d 100644 --- a/cpp/include/cudf/detail/label_bins.hpp +++ b/cpp/include/cudf/detail/label_bins.hpp @@ -25,6 +25,7 @@ #include #include #include +#include namespace cudf { @@ -40,7 +41,7 @@ namespace detail { /** * @copydoc cudf::label_bins(column_view const& input, column_view const& left_edges, inclusive * left_inclusive, column_view const& right_edges, inclusive right_inclusive, rmm::cuda_stream_view, - * rmm::mr::device_memory_resource* mr) + * rmm::device_async_resource_ref mr) * * @param stream Stream view on which to allocate resources and queue execution. */ @@ -50,7 +51,7 @@ std::unique_ptr label_bins(column_view const& input, column_view const& right_edges, inclusive right_inclusive, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** @} */ // end of group } // namespace detail diff --git a/cpp/include/cudf/detail/merge.hpp b/cpp/include/cudf/detail/merge.hpp index 2167a484214..837eda0d7b5 100644 --- a/cpp/include/cudf/detail/merge.hpp +++ b/cpp/include/cudf/detail/merge.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ #pragma once #include +#include #include @@ -45,7 +46,7 @@ using index_vector = rmm::device_uvector; * std::vector const& key_cols, * std::vector const& column_order, * std::vector const& null_precedence, - * rmm::mr::device_memory_resource* mr) + * rmm::device_async_resource_ref mr) * * @param stream CUDA stream used for device memory operations and kernel launches */ @@ -54,7 +55,7 @@ std::unique_ptr merge(std::vector const& tables_to_merg std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/normalizing_iterator.cuh b/cpp/include/cudf/detail/normalizing_iterator.cuh index 8f90afc3e57..32df13104e0 100644 --- a/cpp/include/cudf/detail/normalizing_iterator.cuh +++ b/cpp/include/cudf/detail/normalizing_iterator.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -204,8 +204,8 @@ struct alignas(16) base_normalator { private: struct integer_sizeof_fn { - template ())> - CUDF_HOST_DEVICE constexpr std::size_t operator()() const + template ())> + CUDF_HOST_DEVICE std::size_t operator()() const { #ifndef __CUDA_ARCH__ CUDF_FAIL("only integral types are supported"); @@ -213,8 +213,8 @@ struct alignas(16) base_normalator { CUDF_UNREACHABLE("only integral types are supported"); #endif } - template ())> - CUDF_HOST_DEVICE constexpr std::size_t operator()() const noexcept + template ())> + CUDF_HOST_DEVICE std::size_t operator()() const noexcept { return sizeof(T); } diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index db373f47a01..e62675cbc8c 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -110,7 +111,7 @@ CUDF_KERNEL void offset_bitmask_binop(Binop op, /** * @copydoc bitmask_binop(Binop op, host_span, host_span - * const, size_type, rmm::mr::device_memory_resource *) + * const, size_type, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches */ @@ -120,7 +121,7 @@ std::pair bitmask_binop(Binop op, host_span masks_begin_bits, size_type mask_size_bits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto dest_mask = rmm::device_buffer{bitmask_allocation_size_bytes(mask_size_bits), stream, mr}; auto null_count = @@ -163,7 +164,7 @@ size_type inplace_bitmask_binop(Binop op, CUDF_EXPECTS(std::all_of(masks.begin(), masks.end(), [](auto p) { return p != nullptr; }), "Mask pointer cannot be null"); - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource(); rmm::device_scalar d_counter{0, stream, mr}; rmm::device_uvector d_masks(masks.size(), stream, mr); 
rmm::device_uvector d_begin_bits(masks_begin_bits.size(), stream, mr); @@ -282,7 +283,7 @@ rmm::device_uvector segmented_count_bits(bitmask_type const* bitmask, OffsetIterator last_bit_indices_begin, count_bits_policy count_bits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const num_ranges = static_cast(std::distance(first_bit_indices_begin, first_bit_indices_end)); @@ -541,7 +542,7 @@ std::pair segmented_null_mask_reduction( null_policy null_handling, std::optional valid_initial_value, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const segments_begin = thrust::make_zip_iterator(first_bit_indices_begin, last_bit_indices_begin); diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 74e2ccd2ea1..04d8d663acb 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include #include @@ -28,14 +29,14 @@ namespace cudf { namespace detail { /** - * @copydoc cudf::create_null_mask(size_type, mask_state, rmm::mr::device_memory_resource*) + * @copydoc cudf::create_null_mask(size_type, mask_state, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ rmm::device_buffer create_null_mask(size_type size, mask_state state, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::set_null_mask(bitmask_type*, size_type, size_type, bool) @@ -194,7 +195,7 @@ std::vector segmented_null_count(bitmask_type const* bitmask, /** * @copydoc cudf::copy_bitmask(bitmask_type const*, size_type, size_type, - *rmm::mr::device_memory_resource*) + *rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -202,20 +203,20 @@ rmm::device_buffer copy_bitmask(bitmask_type const* mask, size_type begin_bit, size_type end_bit, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** - * @copydoc cudf::copy_bitmask(column_view const& view, rmm::mr::device_memory_resource*) + * @copydoc cudf::copy_bitmask(column_view const& view, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ rmm::device_buffer copy_bitmask(column_view const& view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc bitmask_and(host_span, host_span const, - * size_type, rmm::mr::device_memory_resource *) + * size_type, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches */ @@ -223,7 +224,7 @@ std::pair bitmask_and(host_span masks_begin_bits, size_type mask_size_bits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::bitmask_and @@ -232,7 +233,7 @@ std::pair bitmask_and(host_span bitmask_and(table_view const& view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::bitmask_or @@ -241,7 +242,7 @@ std::pair bitmask_and(table_view const& view, */ std::pair bitmask_or(table_view const& view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Performs a bitwise AND of the specified bitmasks, @@ -274,7 +275,7 @@ cudf::size_type inplace_bitmask_and(device_span dest_mask, void set_all_valid_null_masks(column_view const& input, column& output, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail diff --git a/cpp/include/cudf/detail/quantiles.hpp b/cpp/include/cudf/detail/quantiles.hpp index ac37d923d85..6c188d2ca68 100644 --- a/cpp/include/cudf/detail/quantiles.hpp +++ b/cpp/include/cudf/detail/quantiles.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -35,7 +36,7 @@ std::unique_ptr quantile(column_view const& input, column_view const& ordered_indices, bool exact, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::quantiles() @@ -49,18 +50,18 @@ std::unique_ptr
quantiles(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::percentile_approx(tdigest_column_view const&, column_view const&, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr percentile_approx(tdigest::tdigest_column_view const& input, column_view const& percentiles, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/repeat.hpp b/cpp/include/cudf/detail/repeat.hpp index 883d5d158fb..abb9e45a95c 100644 --- a/cpp/include/cudf/detail/repeat.hpp +++ b/cpp/include/cudf/detail/repeat.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include #include @@ -28,7 +29,7 @@ namespace detail { /** * @copydoc cudf::repeat(table_view const&, column_view const&, bool, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -36,18 +37,18 @@ std::unique_ptr
repeat(table_view const& input_table, column_view const& count, bool check_count, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::repeat(table_view const&, size_type, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr
repeat(table_view const& input_table, size_type count, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/replace.hpp b/cpp/include/cudf/detail/replace.hpp index da83f7b285d..46203bdf2f0 100644 --- a/cpp/include/cudf/detail/replace.hpp +++ b/cpp/include/cudf/detail/replace.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include #include @@ -27,58 +28,58 @@ namespace cudf { namespace detail { /** * @copydoc cudf::replace_nulls(column_view const&, column_view const&, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr replace_nulls(column_view const& input, cudf::column_view const& replacement, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::replace_nulls(column_view const&, scalar const&, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr replace_nulls(column_view const& input, scalar const& replacement, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::replace_nulls(column_view const&, replace_policy const&, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr replace_nulls(column_view const& input, replace_policy const& replace_policy, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::replace_nans(column_view const&, column_view const&, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr replace_nans(column_view const& input, column_view const& replacement, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::replace_nans(column_view const&, scalar const&, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr replace_nans(column_view const& input, scalar const& replacement, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::find_and_replace_all @@ -89,7 +90,7 @@ std::unique_ptr find_and_replace_all(column_view const& input_col, column_view const& values_to_replace, column_view const& replacement_values, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::normalize_nans_and_zeros @@ -98,7 +99,7 @@ std::unique_ptr find_and_replace_all(column_view const& input_col, */ std::unique_ptr normalize_nans_and_zeros(column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/reshape.hpp b/cpp/include/cudf/detail/reshape.hpp index 5ab53690a23..7a1c3d6c4f0 100644 --- a/cpp/include/cudf/detail/reshape.hpp +++ b/cpp/include/cudf/detail/reshape.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include #include @@ -33,7 +34,7 @@ namespace detail { std::unique_ptr
tile(table_view const& input, size_type count, rmm::cuda_stream_view, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::interleave_columns @@ -42,7 +43,7 @@ std::unique_ptr
tile(table_view const& input, */ std::unique_ptr interleave_columns(table_view const& input, rmm::cuda_stream_view, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/rolling.hpp b/cpp/include/cudf/detail/rolling.hpp index da90217c254..ea6f38c421c 100644 --- a/cpp/include/cudf/detail/rolling.hpp +++ b/cpp/include/cudf/detail/rolling.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include +#include #include @@ -35,7 +36,7 @@ namespace detail { * column_view const& following_window, * size_type min_periods, * rolling_aggregation const& agg, - * rmm::mr::device_memory_resource* mr) + * rmm::device_async_resource_ref mr) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -45,7 +46,7 @@ std::unique_ptr rolling_window(column_view const& input, size_type min_periods, rolling_aggregation const& agg, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/round.hpp b/cpp/include/cudf/detail/round.hpp index cdfc7caef37..1a9c5c82c65 100644 --- a/cpp/include/cudf/detail/round.hpp +++ b/cpp/include/cudf/detail/round.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include namespace cudf { //! 
Inner interfaces and implementations @@ -27,7 +28,7 @@ namespace detail { /** * @copydoc cudf::round(column_view const&, int32_t, rounding_method, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -35,7 +36,7 @@ std::unique_ptr round(column_view const& input, int32_t decimal_places, rounding_method method, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/scan.hpp b/cpp/include/cudf/detail/scan.hpp index f4b2d51d0cb..54c25d0157c 100644 --- a/cpp/include/cudf/detail/scan.hpp +++ b/cpp/include/cudf/detail/scan.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -50,7 +51,7 @@ std::unique_ptr scan_exclusive(column_view const& input, scan_aggregation const& agg, null_policy null_handling, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Computes the inclusive scan of a column. @@ -76,7 +77,7 @@ std::unique_ptr scan_inclusive(column_view const& input, scan_aggregation const& agg, null_policy null_handling, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Generate row ranks for a column. @@ -88,7 +89,7 @@ std::unique_ptr scan_inclusive(column_view const& input, */ std::unique_ptr inclusive_rank_scan(column_view const& order_by, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Generate row dense ranks for a column. 
@@ -100,7 +101,7 @@ std::unique_ptr inclusive_rank_scan(column_view const& order_by, */ std::unique_ptr inclusive_dense_rank_scan(column_view const& order_by, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Generate row ONE_NORMALIZED percent ranks for a column. @@ -113,7 +114,7 @@ std::unique_ptr inclusive_dense_rank_scan(column_view const& order_by, * @return rank values. */ std::unique_ptr inclusive_one_normalized_percent_rank_scan( - column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + column_view const& order_by, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh index dbf7bfa9527..80bc87731ca 100644 --- a/cpp/include/cudf/detail/scatter.cuh +++ b/cpp/include/cudf/detail/scatter.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,10 +29,13 @@ #include #include #include +#include #include +#include #include #include +#include #include #include @@ -145,7 +148,7 @@ struct column_scatterer_impl(target, stream, mr); auto result_view = result->mutable_view(); @@ -170,7 +173,7 @@ struct column_scatterer_impl { MapIterator scatter_map_end, column_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::device_async_resource_ref mr) const { auto d_column = column_device_view::create(source, stream); auto const begin = d_column->begin(); @@ -187,7 +190,7 @@ struct column_scatterer_impl { MapIterator scatter_map_end, column_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::device_async_resource_ref mr) const { return cudf::lists::detail::scatter( source, scatter_map_begin, scatter_map_end, target, stream, mr); @@ -202,7 +205,7 @@ struct column_scatterer_impl { MapIterator scatter_map_end, column_view const& target_in, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::device_async_resource_ref mr) const { if (target_in.is_empty()) // empty begets empty return make_empty_column(type_id::DICTIONARY32); @@ -212,8 +215,9 @@ struct column_scatterer_impl { // check the keys match dictionary_column_view const source(source_in); dictionary_column_view const target(target_in); - CUDF_EXPECTS(source.keys().type() == target.keys().type(), - "scatter dictionary keys must be the same type"); + CUDF_EXPECTS(cudf::have_same_types(source.keys(), target.keys()), + "scatter dictionary keys must be the same type", + cudf::data_type_error); // first combine keys so both dictionaries have the same set auto target_matched = dictionary::detail::add_keys(target, source.keys(), stream, mr); @@ -261,7 +265,7 @@ struct column_scatterer { MapIterator scatter_map_end, column_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + 
rmm::device_async_resource_ref mr) const { column_scatterer_impl scatterer{}; return scatterer(source, scatter_map_begin, scatter_map_end, target, stream, mr); @@ -276,7 +280,7 @@ struct column_scatterer_impl { MapItRoot scatter_map_end, column_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::device_async_resource_ref mr) const { CUDF_EXPECTS(source.num_children() == target.num_children(), "Scatter source and target are not of the same type."); @@ -391,7 +395,7 @@ std::unique_ptr
scatter(table_view const& source, MapIterator scatter_map_end, table_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/include/cudf/detail/scatter.hpp b/cpp/include/cudf/detail/scatter.hpp index 94c795f31b2..95ed6af8c3c 100644 --- a/cpp/include/cudf/detail/scatter.hpp +++ b/cpp/include/cudf/detail/scatter.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include +#include #include @@ -63,11 +64,11 @@ std::unique_ptr
scatter(table_view const& source, column_view const& scatter_map, table_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::detail::scatter(table_view const&,column_view const&,table_view - * const&,bool,rmm::cuda_stream_view,rmm::mr::device_memory_resource*) + * const&,bool,rmm::cuda_stream_view,rmm::device_async_resource_ref) * * @throws cudf::logic_error if `scatter_map` span size is larger than max of `size_type`. */ @@ -75,7 +76,7 @@ std::unique_ptr
scatter(table_view const& source, device_span const scatter_map, table_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Scatters a row of scalar values into a copy of the target table @@ -110,13 +111,13 @@ std::unique_ptr
scatter(std::vector> column_view const& indices, table_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::boolean_mask_scatter( table_view const& source, table_view const& target, * column_view const& boolean_mask, - * rmm::mr::device_memory_resource *mr) + * rmm::device_async_resource_ref mr) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -124,14 +125,14 @@ std::unique_ptr
boolean_mask_scatter(table_view const& source, table_view const& target, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::boolean_mask_scatter( * std::vector> const& source, * table_view const& target, * column_view const& boolean_mask, - * rmm::mr::device_memory_resource *mr) + * rmm::device_async_resource_ref mr) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -140,7 +141,7 @@ std::unique_ptr
boolean_mask_scatter( table_view const& target, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/search.hpp b/cpp/include/cudf/detail/search.hpp index 4277baf3edd..e60b18f4c8d 100644 --- a/cpp/include/cudf/detail/search.hpp +++ b/cpp/include/cudf/detail/search.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include #include +#include namespace cudf::detail { /** @@ -35,7 +36,7 @@ std::unique_ptr lower_bound(table_view const& haystack, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::upper_bound @@ -47,24 +48,24 @@ std::unique_ptr upper_bound(table_view const& haystack, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** - * @copydoc cudf::contains(column_view const&, scalar const&, rmm::mr::device_memory_resource*) + * @copydoc cudf::contains(column_view const&, scalar const&, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ bool contains(column_view const& haystack, scalar const& needle, rmm::cuda_stream_view stream); /** - * @copydoc cudf::contains(column_view const&, column_view const&, rmm::mr::device_memory_resource*) + * @copydoc cudf::contains(column_view const&, column_view const&, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr contains(column_view const& haystack, column_view const& needles, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Check if rows in the given `needles` table exist in the `haystack` table. @@ -96,6 +97,6 @@ rmm::device_uvector contains(table_view const& haystack, null_equality compare_nulls, nan_equality compare_nans, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace cudf::detail diff --git a/cpp/include/cudf/detail/sequence.hpp b/cpp/include/cudf/detail/sequence.hpp index 6f2a43b54de..a18a9d3b200 100644 --- a/cpp/include/cudf/detail/sequence.hpp +++ b/cpp/include/cudf/detail/sequence.hpp @@ -21,12 +21,13 @@ #include #include +#include namespace cudf { namespace detail { /** * @copydoc cudf::sequence(size_type size, scalar const& init, scalar const& step, - * rmm::mr::device_memory_resource* mr = + * rmm::device_async_resource_ref mr = *rmm::mr::get_current_device_resource()) * * @param stream CUDA stream used for device memory operations and kernel launches. @@ -35,11 +36,11 @@ std::unique_ptr sequence(size_type size, scalar const& init, scalar const& step, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::sequence(size_type size, scalar const& init, - rmm::mr::device_memory_resource* mr = + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) * * @param stream CUDA stream used for device memory operations and kernel launches. 
@@ -47,13 +48,13 @@ std::unique_ptr sequence(size_type size, std::unique_ptr sequence(size_type size, scalar const& init, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::calendrical_month_sequence(size_type size, * scalar const& init, * size_type months, - * rmm::mr::device_memory_resource* mr) + * rmm::device_async_resource_ref mr) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -61,7 +62,7 @@ std::unique_ptr calendrical_month_sequence(size_type size, scalar const& init, size_type months, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh index 08917bfce24..63e4fca8915 100644 --- a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh +++ b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -300,7 +301,7 @@ std::pair, size_type> make_offsets_child_column( InputIterator begin, InputIterator end, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto count = static_cast(std::distance(begin, end)); auto offsets_column = make_numeric_column( diff --git a/cpp/include/cudf/detail/sorting.hpp b/cpp/include/cudf/detail/sorting.hpp index 97cc054da57..4ddba38a7e9 100644 --- a/cpp/include/cudf/detail/sorting.hpp +++ b/cpp/include/cudf/detail/sorting.hpp @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -37,7 +38,7 @@ std::unique_ptr sorted_order(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::stable_sorted_order 
@@ -48,7 +49,7 @@ std::unique_ptr stable_sorted_order(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::sort_by_key @@ -60,7 +61,7 @@ std::unique_ptr
sort_by_key(table_view const& values, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::rank @@ -74,7 +75,7 @@ std::unique_ptr rank(column_view const& input, null_order null_precedence, bool percentage, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::stable_sort_by_key @@ -86,7 +87,7 @@ std::unique_ptr
stable_sort_by_key(table_view const& values, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::segmented_sorted_order @@ -98,7 +99,7 @@ std::unique_ptr segmented_sorted_order(table_view const& keys, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::stable_segmented_sorted_order @@ -111,7 +112,7 @@ std::unique_ptr stable_segmented_sorted_order( std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::segmented_sort_by_key @@ -124,7 +125,7 @@ std::unique_ptr
segmented_sort_by_key(table_view const& values, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::stable_segmented_sort_by_key @@ -137,7 +138,7 @@ std::unique_ptr
stable_segmented_sort_by_key(table_view const& values, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::sort @@ -148,7 +149,7 @@ std::unique_ptr
sort(table_view const& values, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::stable_sort @@ -159,7 +160,7 @@ std::unique_ptr
stable_sort(table_view const& values, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 7f366c06a1c..e2974789ea1 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,12 +23,13 @@ #include #include +#include namespace cudf { namespace detail { /** * @copydoc cudf::drop_nulls(table_view const&, std::vector const&, - * cudf::size_type, rmm::mr::device_memory_resource*) + * cudf::size_type, rmm::device_async_resource_ref) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ @@ -36,11 +37,11 @@ std::unique_ptr
drop_nulls(table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::drop_nans(table_view const&, std::vector const&, - * cudf::size_type, rmm::mr::device_memory_resource*) + * cudf::size_type, rmm::device_async_resource_ref) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ @@ -48,7 +49,7 @@ std::unique_ptr
drop_nans(table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::apply_boolean_mask @@ -58,7 +59,7 @@ std::unique_ptr
drop_nans(table_view const& input, std::unique_ptr
apply_boolean_mask(table_view const& input, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::unique @@ -70,7 +71,7 @@ std::unique_ptr
unique(table_view const& input, duplicate_keep_option keep, null_equality nulls_equal, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::distinct @@ -83,7 +84,7 @@ std::unique_ptr
distinct(table_view const& input, null_equality nulls_equal, nan_equality nans_equal, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::stable_distinct @@ -96,7 +97,7 @@ std::unique_ptr
stable_distinct(table_view const& input, null_equality nulls_equal, nan_equality nans_equal, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::distinct_indices @@ -108,7 +109,7 @@ rmm::device_uvector distinct_indices(table_view const& input, null_equality nulls_equal, nan_equality nans_equal, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::unique_count(column_view const&, null_policy, nan_policy) diff --git a/cpp/include/cudf/detail/structs/utilities.hpp b/cpp/include/cudf/detail/structs/utilities.hpp index c0a79142cef..e736514ac29 100644 --- a/cpp/include/cudf/detail/structs/utilities.hpp +++ b/cpp/include/cudf/detail/structs/utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include #include +#include namespace cudf::structs::detail { @@ -175,7 +176,7 @@ class flattened_table { std::vector const& null_precedence, column_nullability nullability, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Superimpose nulls from a given null mask into the input column, using bitwise AND. @@ -197,7 +198,7 @@ class flattened_table { size_type null_count, std::unique_ptr&& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Push down nulls from the given input column into its children columns, using bitwise AND. @@ -222,7 +223,7 @@ class flattened_table { * to be kept alive. 
*/ [[nodiscard]] std::pair push_down_nulls( - column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** * @brief Push down nulls from columns of the input table into their children columns, using @@ -249,7 +250,7 @@ class flattened_table { * to be kept alive. */ [[nodiscard]] std::pair push_down_nulls( - table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + table_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** * @brief Checks if a column or any of its children is a struct column with structs that are null. diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp index b529d4a2c53..bfd12c18fff 100644 --- a/cpp/include/cudf/detail/tdigest/tdigest.hpp +++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include namespace cudf { namespace tdigest { @@ -70,7 +71,7 @@ std::unique_ptr group_tdigest(column_view const& values, size_type num_groups, int max_centroids, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Merges tdigests within the same group to generate a new tdigest. @@ -113,7 +114,7 @@ std::unique_ptr group_merge_tdigest(column_view const& values, size_type num_groups, int max_centroids, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Create a tdigest column from its constituent components. 
@@ -139,7 +140,7 @@ std::unique_ptr make_tdigest_column(size_type num_rows, std::unique_ptr&& min_values, std::unique_ptr&& max_values, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Create an empty tdigest column. @@ -152,7 +153,7 @@ std::unique_ptr make_tdigest_column(size_type num_rows, * @returns An empty tdigest column. */ std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Create an empty tdigest scalar. @@ -165,7 +166,7 @@ std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, * @returns An empty tdigest scalar. */ std::unique_ptr make_empty_tdigest_scalar(rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Generate a tdigest scalar from a set of numeric input values. @@ -199,7 +200,7 @@ std::unique_ptr make_empty_tdigest_scalar(rmm::cuda_stream_view stream, std::unique_ptr reduce_tdigest(column_view const& values, int max_centroids, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Merges multiple tdigest columns to generate a new tdigest scalar. @@ -233,7 +234,7 @@ std::unique_ptr reduce_tdigest(column_view const& values, std::unique_ptr reduce_merge_tdigest(column_view const& input, int max_centroids, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace tdigest diff --git a/cpp/include/cudf/detail/timezone.hpp b/cpp/include/cudf/detail/timezone.hpp index f7f97c0a7c2..037164aa297 100644 --- a/cpp/include/cudf/detail/timezone.hpp +++ b/cpp/include/cudf/detail/timezone.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,12 +18,13 @@ #include #include +#include namespace cudf::detail { /** * @copydoc cudf::make_timezone_transition_table(std::optional, std::string_view, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -31,6 +32,6 @@ std::unique_ptr
make_timezone_transition_table( std::optional tzif_dir, std::string_view timezone_name, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace cudf::detail diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp index 965fea84860..47e13fa2e5e 100644 --- a/cpp/include/cudf/detail/transform.hpp +++ b/cpp/include/cudf/detail/transform.hpp @@ -21,6 +21,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -34,7 +35,7 @@ std::unique_ptr transform(column_view const& input, data_type output_type, bool is_ptx, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::compute_column @@ -44,7 +45,7 @@ std::unique_ptr transform(column_view const& input, std::unique_ptr compute_column(table_view const& table, ast::expression const& expr, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::nans_to_nulls @@ -52,7 +53,7 @@ std::unique_ptr compute_column(table_view const& table, * @param stream CUDA stream used for device memory operations and kernel launches. */ std::pair, size_type> nans_to_nulls( - column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** * @copydoc cudf::bools_to_mask @@ -60,7 +61,7 @@ std::pair, size_type> nans_to_nulls( * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::pair, cudf::size_type> bools_to_mask( - column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** * @copydoc cudf::encode @@ -68,7 +69,7 @@ std::pair, cudf::size_type> bools_to_mask( * @param stream CUDA stream used for device memory operations and kernel launches. */ std::pair, std::unique_ptr> encode( - cudf::table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + cudf::table_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** * @copydoc cudf::one_hot_encode @@ -78,7 +79,7 @@ std::pair, std::unique_ptr> encode( std::pair, table_view> one_hot_encode(column_view const& input, column_view const& categories, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::mask_to_bools @@ -89,7 +90,7 @@ std::unique_ptr mask_to_bools(bitmask_type const* null_mask, size_type begin_bit, size_type end_bit, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::row_bit_count @@ -98,7 +99,7 @@ std::unique_ptr mask_to_bools(bitmask_type const* null_mask, */ std::unique_ptr row_bit_count(table_view const& t, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::segmented_row_bit_count @@ -108,7 +109,7 @@ std::unique_ptr row_bit_count(table_view const& t, std::unique_ptr segmented_row_bit_count(table_view const& t, size_type segment_length, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/transpose.hpp b/cpp/include/cudf/detail/transpose.hpp index d0be51860b2..1f8effc8103 100644 --- 
a/cpp/include/cudf/detail/transpose.hpp +++ b/cpp/include/cudf/detail/transpose.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -30,7 +31,7 @@ namespace detail { */ std::pair, table_view> transpose(table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/unary.hpp b/cpp/include/cudf/detail/unary.hpp index 12f864de572..5245cfdf079 100644 --- a/cpp/include/cudf/detail/unary.hpp +++ b/cpp/include/cudf/detail/unary.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include #include +#include #include @@ -50,7 +51,7 @@ std::unique_ptr true_if(InputIterator begin, size_type size, Predicate p, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto output = make_numeric_column(data_type(type_id::BOOL8), size, mask_state::UNALLOCATED, stream, mr); @@ -68,14 +69,14 @@ std::unique_ptr true_if(InputIterator begin, std::unique_ptr unary_operation(cudf::column_view const& input, cudf::unary_operator op, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::is_valid */ std::unique_ptr is_valid(cudf::column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::cast @@ -83,21 +84,21 @@ std::unique_ptr is_valid(cudf::column_view const& input, std::unique_ptr cast(column_view const& input, data_type type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::is_nan */ std::unique_ptr is_nan(cudf::column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::is_not_nan */ std::unique_ptr is_not_nan(cudf::column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index 86c85ca8d06..f1775c6d6d7 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -93,6 +93,19 @@ class grid_1d { return global_thread_id(threadIdx.x, blockIdx.x, blockDim.x); } + /** + * @brief Returns the global thread index of the current thread in a 1D grid. 
+ * + * @tparam num_threads_per_block The number of threads per block + * + * @return thread_index_type The global thread index + */ + template + static __device__ thread_index_type global_thread_id() + { + return global_thread_id(threadIdx.x, blockIdx.x, num_threads_per_block); + } + /** * @brief Returns the stride of a 1D grid. * @@ -115,6 +128,19 @@ class grid_1d { * @return thread_index_type The number of threads in the grid. */ static __device__ thread_index_type grid_stride() { return grid_stride(blockDim.x, gridDim.x); } + + /** + * @brief Returns the stride of the current 1D grid. + * + * @tparam num_threads_per_block The number of threads per block + * + * @return thread_index_type The number of threads in the grid. + */ + template + static __device__ thread_index_type grid_stride() + { + return grid_stride(num_threads_per_block, gridDim.x); + } }; /** diff --git a/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp b/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp index 858501877b0..6901a19473e 100644 --- a/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp @@ -109,30 +109,6 @@ class rmm_host_allocator { { } - /** - * @brief Copy constructor - */ - rmm_host_allocator(rmm_host_allocator const& other) = default; - - /** - * @brief Move constructor - */ - rmm_host_allocator(rmm_host_allocator&& other) = default; - - /** - * @brief Assignment operator - */ - rmm_host_allocator& operator=(rmm_host_allocator const& other) - { - mr = other.mr; - return *this; - } - - /** - * @brief rmm_host_allocator's null destructor does nothing. - */ - inline ~rmm_host_allocator() {} - /** * @brief This method allocates storage for objects in host memory. * @@ -183,7 +159,10 @@ class rmm_host_allocator { * @param x The other \p rmm_host_allocator of interest. * @return This method always returns \c true. 
*/ - inline bool operator==(rmm_host_allocator const& x) const { return x.mr == mr; } + inline bool operator==(rmm_host_allocator const& x) const + { + return x.mr == mr && x.stream == stream; + } /** * @brief This method tests this \p rmm_host_allocator for inequality diff --git a/cpp/include/cudf/detail/utilities/stream_pool.hpp b/cpp/include/cudf/detail/utilities/stream_pool.hpp index 19ef26a10cb..e19cc3ec2f7 100644 --- a/cpp/include/cudf/detail/utilities/stream_pool.hpp +++ b/cpp/include/cudf/detail/utilities/stream_pool.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -81,6 +81,11 @@ class cuda_stream_pool { */ cuda_stream_pool* create_global_cuda_stream_pool(); +/** + * @brief Get the global stream pool. + */ +cuda_stream_pool& global_cuda_stream_pool(); + /** * @brief Acquire a set of `cuda_stream_view` objects and synchronize them to an event on another * stream. diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 90ad98741ad..293a4096c57 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -50,7 +51,7 @@ namespace detail { template rmm::device_uvector make_zeroed_device_uvector_async(std::size_t size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { rmm::device_uvector ret(size, stream, mr); CUDF_CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value())); @@ -71,7 +72,7 @@ rmm::device_uvector make_zeroed_device_uvector_async(std::size_t size, template rmm::device_uvector make_zeroed_device_uvector_sync(std::size_t size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { rmm::device_uvector ret(size, stream, mr); CUDF_CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value())); @@ -94,7 +95,7 @@ rmm::device_uvector make_zeroed_device_uvector_sync(std::size_t size, template rmm::device_uvector make_device_uvector_async(host_span source_data, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { rmm::device_uvector ret(source_data.size(), stream, mr); CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(), @@ -123,7 +124,7 @@ template < std::enable_if_t< std::is_convertible_v>>* = nullptr> rmm::device_uvector make_device_uvector_async( - Container const& c, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) + Container const& c, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { return make_device_uvector_async(host_span{c}, stream, mr); } @@ -143,7 +144,7 @@ rmm::device_uvector make_device_uvector_async( template rmm::device_uvector make_device_uvector_async(device_span source_data, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { rmm::device_uvector ret(source_data.size(), stream, mr); CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(), @@ -172,7 +173,7 @@ template < std::enable_if_t< std::is_convertible_v>>* = nullptr> 
rmm::device_uvector make_device_uvector_async( - Container const& c, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) + Container const& c, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { return make_device_uvector_async( device_span{c}, stream, mr); @@ -193,7 +194,7 @@ rmm::device_uvector make_device_uvector_async( template rmm::device_uvector make_device_uvector_sync(host_span source_data, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto ret = make_device_uvector_async(source_data, stream, mr); stream.synchronize(); @@ -218,7 +219,7 @@ template < std::enable_if_t< std::is_convertible_v>>* = nullptr> rmm::device_uvector make_device_uvector_sync( - Container const& c, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) + Container const& c, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { return make_device_uvector_sync(host_span{c}, stream, mr); } @@ -238,7 +239,7 @@ rmm::device_uvector make_device_uvector_sync( template rmm::device_uvector make_device_uvector_sync(device_span source_data, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto ret = make_device_uvector_async(source_data, stream, mr); stream.synchronize(); @@ -263,7 +264,7 @@ template < std::enable_if_t< std::is_convertible_v>>* = nullptr> rmm::device_uvector make_device_uvector_sync( - Container const& c, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) + Container const& c, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { return make_device_uvector_sync(device_span{c}, stream, mr); } diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index d0073177445..64a3c4edf78 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -25,6 +25,7 @@ #include #include +#include #include @@ -49,8 +50,8 @@ 
CUDF_KERNEL void valid_if_kernel( { constexpr size_type leader_lane{0}; auto const lane_id{threadIdx.x % warp_size}; - auto i = cudf::detail::grid_1d::global_thread_id(); - auto const stride = cudf::detail::grid_1d::grid_stride(); + auto i = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); size_type warp_valid_count{0}; auto active_mask = __ballot_sync(0xFFFF'FFFFu, i < size); @@ -90,7 +91,7 @@ std::pair valid_if(InputIterator begin, InputIterator end, Predicate p, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(begin <= end, "Invalid range."); diff --git a/cpp/include/cudf/dictionary/detail/concatenate.hpp b/cpp/include/cudf/dictionary/detail/concatenate.hpp index d74429484ce..55f3825b3ec 100644 --- a/cpp/include/cudf/dictionary/detail/concatenate.hpp +++ b/cpp/include/cudf/dictionary/detail/concatenate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include namespace cudf { namespace dictionary { @@ -39,7 +40,7 @@ namespace detail { */ std::unique_ptr concatenate(host_span columns, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace dictionary diff --git a/cpp/include/cudf/dictionary/detail/encode.hpp b/cpp/include/cudf/dictionary/detail/encode.hpp index 2aad7dd80ed..3b5a3bbab56 100644 --- a/cpp/include/cudf/dictionary/detail/encode.hpp +++ b/cpp/include/cudf/dictionary/detail/encode.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include namespace cudf { namespace dictionary { @@ -54,7 +55,7 @@ namespace detail { std::unique_ptr encode(column_view const& column, data_type indices_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Create a column by gathering the keys from the provided @@ -73,7 +74,7 @@ std::unique_ptr encode(column_view const& column, */ std::unique_ptr decode(dictionary_column_view const& dictionary_column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Return minimal integer type for the given number of elements. diff --git a/cpp/include/cudf/dictionary/detail/merge.hpp b/cpp/include/cudf/dictionary/detail/merge.hpp index cad495d0097..c4229690ff5 100644 --- a/cpp/include/cudf/dictionary/detail/merge.hpp +++ b/cpp/include/cudf/dictionary/detail/merge.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include #include +#include namespace cudf { namespace dictionary { @@ -44,7 +45,7 @@ std::unique_ptr merge(dictionary_column_view const& lcol, dictionary_column_view const& rcol, cudf::detail::index_vector const& row_order, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace dictionary diff --git a/cpp/include/cudf/dictionary/detail/replace.hpp b/cpp/include/cudf/dictionary/detail/replace.hpp index 0778baa84d6..81a91d57169 100644 --- a/cpp/include/cudf/dictionary/detail/replace.hpp +++ b/cpp/include/cudf/dictionary/detail/replace.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include namespace cudf { namespace dictionary { @@ -42,7 +43,7 @@ namespace detail { std::unique_ptr replace_nulls(dictionary_column_view const& input, dictionary_column_view const& replacement, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Create a new dictionary column by replacing nulls with a @@ -59,7 +60,7 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, std::unique_ptr replace_nulls(dictionary_column_view const& input, scalar const& replacement, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace dictionary diff --git a/cpp/include/cudf/dictionary/detail/search.hpp b/cpp/include/cudf/dictionary/detail/search.hpp index 62059306b9a..2563b96b214 100644 --- a/cpp/include/cudf/dictionary/detail/search.hpp +++ b/cpp/include/cudf/dictionary/detail/search.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include namespace cudf { namespace dictionary { @@ -27,14 +28,14 @@ namespace detail { /** * @copydoc cudf::dictionary::get_index(dictionary_column_view const&,scalar - * const&,rmm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr get_index(dictionary_column_view const& dictionary, scalar const& key, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Get the index for a key if it were added to the given dictionary. @@ -58,7 +59,7 @@ std::unique_ptr get_index(dictionary_column_view const& dictionary, std::unique_ptr get_insert_index(dictionary_column_view const& dictionary, scalar const& key, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace dictionary diff --git a/cpp/include/cudf/dictionary/detail/update_keys.hpp b/cpp/include/cudf/dictionary/detail/update_keys.hpp index 6fd743ad526..e8486a80afc 100644 --- a/cpp/include/cudf/dictionary/detail/update_keys.hpp +++ b/cpp/include/cudf/dictionary/detail/update_keys.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include #include +#include namespace cudf { namespace dictionary { @@ -35,7 +36,7 @@ namespace detail { std::unique_ptr add_keys(dictionary_column_view const& dictionary_column, column_view const& new_keys, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::dictionary::remove_keys(dictionary_column_view const&,column_view @@ -46,7 +47,7 @@ std::unique_ptr add_keys(dictionary_column_view const& dictionary_column std::unique_ptr remove_keys(dictionary_column_view const& dictionary_column, column_view const& keys_to_remove, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::dictionary::remove_unused_keys(dictionary_column_view @@ -56,7 +57,7 @@ std::unique_ptr remove_keys(dictionary_column_view const& dictionary_col */ std::unique_ptr remove_unused_keys(dictionary_column_view const& dictionary_column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::dictionary::set_keys(dictionary_column_view @@ -67,7 +68,7 @@ std::unique_ptr remove_unused_keys(dictionary_column_view const& diction std::unique_ptr set_keys(dictionary_column_view const& dictionary_column, column_view const& keys, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc @@ -78,7 +79,7 @@ std::unique_ptr set_keys(dictionary_column_view const& dictionary_column std::vector> match_dictionaries( cudf::host_span input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Create new dictionaries that have keys merged from dictionary columns @@ -100,9 +101,7 @@ std::vector> match_dictionaries( * @return New dictionary columns and updated cudf::table_views. 
*/ std::pair>, std::vector> match_dictionaries( - std::vector tables, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + std::vector tables, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); } // namespace detail } // namespace dictionary diff --git a/cpp/include/cudf/dictionary/dictionary_factories.hpp b/cpp/include/cudf/dictionary/dictionary_factories.hpp index 821981ad148..7cdfa3bf9e5 100644 --- a/cpp/include/cudf/dictionary/dictionary_factories.hpp +++ b/cpp/include/cudf/dictionary/dictionary_factories.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include namespace cudf { /** @@ -65,8 +66,8 @@ namespace cudf { std::unique_ptr make_dictionary_column( column_view const& keys_column, column_view const& indices_column, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a dictionary column by taking ownership of the provided keys @@ -117,8 +118,8 @@ std::unique_ptr make_dictionary_column(std::unique_ptr keys_colu std::unique_ptr make_dictionary_column( std::unique_ptr keys_column, std::unique_ptr indices_column, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/dictionary/encode.hpp b/cpp/include/cudf/dictionary/encode.hpp index 
959b785bf87..768e2be2b0d 100644 --- a/cpp/include/cudf/dictionary/encode.hpp +++ b/cpp/include/cudf/dictionary/encode.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include namespace cudf { namespace dictionary { @@ -59,9 +60,9 @@ namespace dictionary { */ std::unique_ptr encode( column_view const& column, - data_type indices_type = data_type{type_id::UINT32}, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + data_type indices_type = data_type{type_id::UINT32}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create a column by gathering the keys from the provided @@ -80,8 +81,8 @@ std::unique_ptr encode( */ std::unique_ptr decode( dictionary_column_view const& dictionary_column, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace dictionary diff --git a/cpp/include/cudf/dictionary/search.hpp b/cpp/include/cudf/dictionary/search.hpp index 1b72cf42acd..1dff6dc1d5d 100644 --- a/cpp/include/cudf/dictionary/search.hpp +++ b/cpp/include/cudf/dictionary/search.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,6 +19,7 @@ #include #include +#include namespace cudf { namespace dictionary { @@ -44,8 +45,8 @@ namespace dictionary { std::unique_ptr get_index( dictionary_column_view const& dictionary, scalar const& key, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace dictionary diff --git a/cpp/include/cudf/dictionary/update_keys.hpp b/cpp/include/cudf/dictionary/update_keys.hpp index 40504c22edd..ce7057359a1 100644 --- a/cpp/include/cudf/dictionary/update_keys.hpp +++ b/cpp/include/cudf/dictionary/update_keys.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include #include +#include namespace cudf { namespace dictionary { @@ -59,8 +60,8 @@ namespace dictionary { std::unique_ptr add_keys( dictionary_column_view const& dictionary_column, column_view const& new_keys, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create a new dictionary column by removing the specified keys @@ -91,8 +92,8 @@ std::unique_ptr add_keys( std::unique_ptr remove_keys( dictionary_column_view const& dictionary_column, column_view const& keys_to_remove, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create a new dictionary column by removing any keys @@ -113,8 +114,8 @@ std::unique_ptr remove_keys( */ std::unique_ptr remove_unused_keys( dictionary_column_view const& dictionary_column, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create a new dictionary column by applying only the specified keys @@ -147,8 +148,8 @@ std::unique_ptr remove_unused_keys( std::unique_ptr set_keys( dictionary_column_view const& dictionary_column, column_view const& keys, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = 
rmm::mr::get_current_device_resource()); /** * @brief Create new dictionaries that have keys merged from the input dictionaries. @@ -163,8 +164,8 @@ std::unique_ptr set_keys( */ std::vector> match_dictionaries( cudf::host_span input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace dictionary diff --git a/cpp/include/cudf/filling.hpp b/cpp/include/cudf/filling.hpp index 1268f488919..90139e8634a 100644 --- a/cpp/include/cudf/filling.hpp +++ b/cpp/include/cudf/filling.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include #include @@ -91,8 +92,8 @@ std::unique_ptr fill( size_type begin, size_type end, scalar const& value, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Repeat rows of a Table. @@ -125,8 +126,8 @@ std::unique_ptr fill( std::unique_ptr
repeat( table_view const& input_table, column_view const& count, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Repeat rows of a Table. @@ -150,8 +151,8 @@ std::unique_ptr
repeat( std::unique_ptr
repeat( table_view const& input_table, size_type count, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Fills a column with a sequence of value specified by an initial value and a step. @@ -181,8 +182,8 @@ std::unique_ptr sequence( size_type size, scalar const& init, scalar const& step, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Fills a column with a sequence of value specified by an initial value and a step of 1. @@ -208,8 +209,8 @@ std::unique_ptr sequence( std::unique_ptr sequence( size_type size, scalar const& init, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Generate a sequence of timestamps beginning at `init` and incrementing by `months` for @@ -239,8 +240,8 @@ std::unique_ptr calendrical_month_sequence( size_type size, scalar const& init, size_type months, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index 4445af6c5a8..e39d75757e8 100644 --- 
a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -67,18 +67,6 @@ constexpr inline auto is_supported_representation_type() cuda::std::is_same_v; } -/** - * @brief Returns `true` if the value type is supported for constructing a `fixed_point` - * - * @tparam T The construction value type - * @return `true` if the value type is supported to construct a `fixed_point` type - */ -template -constexpr inline auto is_supported_construction_value_type() -{ - return cuda::std::is_integral() || cuda::std::is_floating_point_v; -} - /** @} */ // end of group // Helper functions for `fixed_point` type @@ -222,23 +210,8 @@ class fixed_point { scale_type _scale; public: - using rep = Rep; ///< The representation type - - /** - * @brief Constructor that will perform shifting to store value appropriately (from floating point - * types) - * - * @tparam T The floating point type that you are constructing from - * @param value The value that will be constructed from - * @param scale The exponent that is applied to Rad to perform shifting - */ - template () && - is_supported_representation_type()>* = nullptr> - CUDF_HOST_DEVICE inline explicit fixed_point(T const& value, scale_type const& scale) - : _value{static_cast(detail::shift(value, scale))}, _scale{scale} - { - } + using rep = Rep; ///< The representation type + static constexpr auto rad = Rad; ///< The base /** * @brief Constructor that will perform shifting to store value appropriately (from integral @@ -249,7 +222,7 @@ class fixed_point { * @param scale The exponent that is applied to Rad to perform shifting */ template () && + typename cuda::std::enable_if_t && is_supported_representation_type()>* = nullptr> CUDF_HOST_DEVICE inline explicit fixed_point(T const& value, scale_type const& scale) // `value` is cast to `Rep` to avoid overflow in cases where @@ -275,8 +248,7 @@ class fixed_point { * @tparam T The value type being constructing from * @param value The value that will 
be constructed from */ - template ()>* = nullptr> + template >* = nullptr> CUDF_HOST_DEVICE inline fixed_point(T const& value) : _value{static_cast(value)}, _scale{scale_type{0}} { @@ -288,19 +260,6 @@ class fixed_point { */ CUDF_HOST_DEVICE inline fixed_point() : _scale{scale_type{0}} {} - /** - * @brief Explicit conversion operator for casting to floating point types - * - * @tparam U The floating point type that is being explicitly converted to - * @return The `fixed_point` number in base 10 (aka human readable format) - */ - template >* = nullptr> - explicit constexpr operator U() const - { - return detail::shift(static_cast(_value), scale_type{-_scale}); - } - /** * @brief Explicit conversion operator for casting to integral types * diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index 1c31e8777a8..831ef68ed15 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ #include #include +#include #include #include @@ -184,17 +185,17 @@ class groupby { */ std::pair, std::vector> aggregate( host_span requests, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** - * @copydoc aggregate(host_span, rmm::mr::device_memory_resource*) + * @copydoc aggregate(host_span, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::pair, std::vector> aggregate( host_span requests, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Performs grouped scans on the specified values. * @@ -248,7 +249,7 @@ class groupby { */ std::pair, std::vector> scan( host_span requests, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Performs grouped shifts for specified values. @@ -304,7 +305,7 @@ class groupby { table_view const& values, host_span offsets, std::vector> const& fill_values, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief The grouped data corresponding to a groupby operation on a set of values. @@ -332,8 +333,8 @@ class groupby { * returned groups * @return A `groups` object representing grouped keys and values */ - groups get_groups(cudf::table_view values = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + groups get_groups(cudf::table_view values = {}, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Performs grouped replace nulls on @p value @@ -373,7 +374,7 @@ class groupby { std::pair, std::unique_ptr
> replace_nulls( table_view const& values, host_span replace_policies, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); private: table_view _keys; ///< Keys that determine grouping @@ -404,18 +405,18 @@ class groupby { std::pair, std::vector> dispatch_aggregation( host_span requests, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); // Sort-based groupby std::pair, std::vector> sort_aggregate( host_span requests, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); std::pair, std::vector> sort_scan( host_span requests, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); }; /** @} */ } // namespace groupby diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index 64a78da1803..3c2f6dfe0d5 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -19,6 +19,7 @@ #include #include +#include namespace cudf { @@ -34,42 +35,11 @@ namespace cudf { */ using hash_value_type = uint32_t; -/** - * @brief Identifies the hash function to be used - * - */ -enum class hash_id { - HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed - HASH_MURMUR3, ///< Murmur3 hash function - HASH_SPARK_MURMUR3, ///< Spark Murmur3 hash function - HASH_MD5 ///< MD5 hash function -}; - /** * @brief The default seed value for hash functions */ static constexpr uint32_t DEFAULT_HASH_SEED = 0; -/** - * @brief Computes the hash value of each row in the input set of columns. 
- * - * @deprecated Since 23.08 - * - * @param input The table of columns to hash - * @param hash_function The hash function enum to use - * @param seed Optional seed value to use for the hash function - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * - * @returns A column where each row is the hash of a column from the input - */ -[[deprecated]] std::unique_ptr hash( - table_view const& input, - hash_id hash_function = hash_id::HASH_MURMUR3, - uint32_t seed = DEFAULT_HASH_SEED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - //! Hash APIs namespace hashing { @@ -89,9 +59,9 @@ namespace hashing { */ std::unique_ptr murmurhash3_x86_32( table_view const& input, - uint32_t seed = DEFAULT_HASH_SEED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + uint32_t seed = DEFAULT_HASH_SEED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Computes the MurmurHash3 64-bit hash value of each row in the given table @@ -108,30 +78,9 @@ std::unique_ptr murmurhash3_x86_32( */ std::unique_ptr
murmurhash3_x64_128( table_view const& input, - uint64_t seed = DEFAULT_HASH_SEED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Computes the MurmurHash3 32-bit hash value of each row in the given table - * - * @deprecated Since 24.04 - * - * This function computes the hash similar to MurmurHash3_x86_32 with special processing - * to match Spark's implementation results. - * - * @param input The table of columns to hash - * @param seed Optional seed value to use for the hash function - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * - * @returns A column where each row is the hash of a row from the input - */ -[[deprecated]] std::unique_ptr spark_murmurhash3_x86_32( - table_view const& input, - uint32_t seed = DEFAULT_HASH_SEED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + uint64_t seed = DEFAULT_HASH_SEED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Computes the MD5 hash value of each row in the given table @@ -144,8 +93,8 @@ std::unique_ptr
murmurhash3_x64_128( */ std::unique_ptr md5( table_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Computes the SHA-1 hash value of each row in the given table @@ -158,8 +107,8 @@ std::unique_ptr md5( */ std::unique_ptr sha1( table_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Computes the SHA-224 hash value of each row in the given table @@ -172,8 +121,8 @@ std::unique_ptr sha1( */ std::unique_ptr sha224( table_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Computes the SHA-256 hash value of each row in the given table @@ -186,8 +135,8 @@ std::unique_ptr sha224( */ std::unique_ptr sha256( table_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Computes the SHA-384 hash value of each row in the given table @@ -200,8 +149,8 @@ std::unique_ptr sha256( */ std::unique_ptr sha384( table_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Computes the SHA-512 hash value of each row in the given table @@ -214,8 +163,8 @@ std::unique_ptr sha384( */ std::unique_ptr sha512( table_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Computes the XXHash_64 hash value of each row in the given table @@ -231,9 +180,9 @@ std::unique_ptr sha512( */ std::unique_ptr xxhash_64( table_view const& input, - uint64_t seed = DEFAULT_HASH_SEED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + uint64_t seed = DEFAULT_HASH_SEED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace hashing diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp index eaeb5d6b068..77266ceb48f 100644 --- a/cpp/include/cudf/hashing/detail/hashing.hpp +++ b/cpp/include/cudf/hashing/detail/hashing.hpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -30,46 +31,41 @@ namespace detail { std::unique_ptr murmurhash3_x86_32(table_view const& input, uint32_t seed, rmm::cuda_stream_view, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); std::unique_ptr
murmurhash3_x64_128(table_view const& input, uint64_t seed, rmm::cuda_stream_view, - rmm::mr::device_memory_resource* mr); - -std::unique_ptr spark_murmurhash3_x86_32(table_view const& input, - uint32_t seed, - rmm::cuda_stream_view, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); std::unique_ptr md5(table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); std::unique_ptr sha1(table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); std::unique_ptr sha224(table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); std::unique_ptr sha256(table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); std::unique_ptr sha384(table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); std::unique_ptr sha512(table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); std::unique_ptr xxhash_64(table_view const& input, uint64_t seed, rmm::cuda_stream_view, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /* Copyright 2005-2014 Daniel James. * diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 2ee6f19614d..bb05a622f40 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,6 +23,8 @@ #pragma nv_diag_suppress 611 #pragma nv_diag_suppress 2810 #endif +#include + #include #ifdef __CUDACC__ #pragma nv_diag_default 611 @@ -34,11 +36,16 @@ #include #include #include +#include #include struct DLManagedTensor; +struct ArrowDeviceArray; + +struct ArrowSchema; + namespace cudf { /** * @addtogroup interop_dlpack @@ -65,7 +72,7 @@ namespace cudf { */ std::unique_ptr
from_dlpack( DLManagedTensor const* managed_tensor, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Convert a cudf table into a DLPack DLTensor @@ -87,7 +94,7 @@ std::unique_ptr
from_dlpack( */ DLManagedTensor* to_dlpack( table_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group @@ -162,6 +169,159 @@ std::shared_ptr to_arrow(cudf::scalar const& input, column_metadata const& metadata = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); + +/** + * @brief typedef for a unique_ptr to an ArrowSchema with custom deleter + * + */ +using unique_schema_t = std::unique_ptr; + +/** + * @brief typedef for a unique_ptr to an ArrowDeviceArray with a custom deleter + * + */ +using unique_device_array_t = std::unique_ptr; + +/** + * @brief Create ArrowSchema from cudf table and metadata + * + * Populates and returns an ArrowSchema C struct using a table and metadata. + * + * @note For decimals, since the precision is not stored for them in libcudf, + * decimals will be converted to an Arrow decimal128 which has the widest precision that cudf + * decimal type supports. For example, `numeric::decimal32` will be converted to Arrow decimal128 + * with the precision of 9 which is the maximum precision for 32-bit types. Similarly, + * `numeric::decimal128` will be converted to Arrow decimal128 with the precision of 38. + * + * @param input Table to create a schema from + * @param metadata Contains the hierarchy of names of columns and children + * @return ArrowSchema generated from `input` + */ +unique_schema_t to_arrow_schema(cudf::table_view const& input, + cudf::host_span metadata); + +/** + * @brief Create `ArrowDeviceArray` from cudf table and metadata + * + * Populates the C struct ArrowDeviceArray without performing copies if possible. + * This maintains the data on the GPU device and gives ownership of the table + * and its buffers to the ArrowDeviceArray struct. 
+ * + * After calling this function, the release callback on the returned ArrowDeviceArray + * must be called to clean up the memory. + * + * @note For decimals, since the precision is not stored for them in libcudf + * it will be converted to an Arrow decimal128 with the widest-precision the cudf decimal type + * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be + * converted to Arrow decimal128 of the precision 38. + * + * @note Copies will be performed in the cases where cudf differs from Arrow + * such as in the representation of bools (Arrow uses a bitmap, cudf uses 1-byte per value). + * + * @param table Input table, ownership of the data will be moved to the result + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used for any allocations during conversion + * @return ArrowDeviceArray which will have ownership of the GPU data, consumer must call release + */ +unique_device_array_t to_arrow_device( + cudf::table&& table, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Create `ArrowDeviceArray` from cudf column and metadata + * + * Populates the C struct ArrowDeviceArray without performing copies if possible. + * This maintains the data on the GPU device and gives ownership of the table + * and its buffers to the ArrowDeviceArray struct. + * + * After calling this function, the release callback on the returned ArrowDeviceArray + * must be called to clean up the memory. + * + * @note For decimals, since the precision is not stored for them in libcudf + * it will be converted to an Arrow decimal128 with the widest-precision the cudf decimal type + * supports. 
For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types. Similar, numeric::decimal128 will be + * converted to Arrow decimal128 of the precision 38. + * + * @note Copies will be performed in the cases where cudf differs from Arrow such as + * in the representation of bools (Arrow uses a bitmap, cudf uses 1 byte per value). + * + * @param col Input column, ownership of the data will be moved to the result + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used for any allocations during conversion + * @return ArrowDeviceArray which will have ownership of the GPU data + */ +unique_device_array_t to_arrow_device( + cudf::column&& col, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Create `ArrowDeviceArray` from a table view + * + * Populates the C struct ArrowDeviceArray performing copies only if necessary. + * This wraps the data on the GPU device and gives a view of the table data + * to the ArrowDeviceArray struct. If the caller frees the data referenced by + * the table_view, using the returned object results in undefined behavior. + * + * After calling this function, the release callback on the returned ArrowDeviceArray + * must be called to clean up any memory created during conversion. + * + * @note For decimals, since the precision is not stored for them in libcudf + * it will be converted to an Arrow decimal128 with the widest-precision the cudf decimal type + * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be + * converted to Arrow decimal128 of the precision 38. 
+ * + * Copies will be performed in the cases where cudf differs from Arrow: + * - BOOL8: Arrow uses a bitmap and cudf uses 1 byte per value + * - DECIMAL32 and DECIMAL64: Converted to Arrow decimal128 + * - STRING: Arrow expects a single value int32 offset child array for empty strings columns + * + * @param table Input table + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used for any allocations during conversion + * @return ArrowDeviceArray which will have ownership of any copied data + */ +unique_device_array_t to_arrow_device( + cudf::table_view const& table, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Create `ArrowDeviceArray` from a column view + * + * Populates the C struct ArrowDeviceArray performing copies only if necessary. + * This wraps the data on the GPU device and gives a view of the column data + * to the ArrowDeviceArray struct. If the caller frees the data referenced by + * the column_view, using the returned object results in undefined behavior. + * + * After calling this function, the release callback on the returned ArrowDeviceArray + * must be called to clean up any memory created during conversion. + * + * @note For decimals, since the precision is not stored for them in libcudf + * it will be converted to an Arrow decimal128 with the widest-precision the cudf decimal type + * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types. Similar, numeric::decimal128 will be + * converted to Arrow decimal128 of the precision 38. 
+ * + * Copies will be performed in the cases where cudf differs from Arrow: + * - BOOL8: Arrow uses a bitmap and cudf uses 1 byte per value + * - DECIMAL32 and DECIMAL64: Converted to Arrow decimal128 + * - STRING: Arrow expects a single value int32 offset child array for empty strings columns + * + * @param col Input column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used for any allocations during conversion + * @return ArrowDeviceArray which will have ownership of any copied data + */ +unique_device_array_t to_arrow_device( + cudf::column_view const& col, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + /** * @brief Create `cudf::table` from given arrow Table input * @@ -170,11 +330,10 @@ std::shared_ptr to_arrow(cudf::scalar const& input, * @param mr Device memory resource used to allocate `cudf::table` * @return cudf table generated from given arrow Table */ - std::unique_ptr
from_arrow( arrow::Table const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::scalar` from given arrow Scalar input @@ -184,9 +343,132 @@ std::unique_ptr
from_arrow( * @param mr Device memory resource used to allocate `cudf::scalar` * @return cudf scalar generated from given arrow Scalar */ - std::unique_ptr from_arrow( arrow::Scalar const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief typedef for a vector of owning columns, used for conversion from ArrowDeviceArray + * + */ +using owned_columns_t = std::vector>; + +/** + * @brief functor for a custom deleter to a unique_ptr of table_view + * + * When converting from an ArrowDeviceArray, there are cases where data can't + * be zero-copy (i.e. bools or non-UINT32 dictionary indices). This custom deleter + * is used to maintain ownership over the data allocated since a `cudf::table_view` + * doesn't hold ownership. + */ +template +struct custom_view_deleter { + /** + * @brief Construct a new custom view deleter object + * + * @param owned Vector of owning columns + */ + explicit custom_view_deleter(owned_columns_t&& owned) : owned_mem_{std::move(owned)} {} + + /** + * @brief operator to delete the unique_ptr + * + * @param ptr Pointer to the object to be deleted + */ + void operator()(ViewType* ptr) const { delete ptr; } + + owned_columns_t owned_mem_; ///< Owned columns that must be deleted. +}; + +/** + * @brief typedef for a unique_ptr to a `cudf::table_view` with custom deleter + * + */ +using unique_table_view_t = + std::unique_ptr>; + +/** + * @brief Create `cudf::table_view` from given `ArrowDeviceArray` and `ArrowSchema` + * + * Constructs a non-owning `cudf::table_view` using `ArrowDeviceArray` and `ArrowSchema`, + * data must be accessible to the CUDA device. Because the resulting `cudf::table_view` will + * not own the data, the `ArrowDeviceArray` must be kept alive for the lifetime of the result. 
+ * It is the responsibility of callers to ensure they call the release callback on the + * `ArrowDeviceArray` after it is no longer needed, and that the `cudf::table_view` is not + * accessed after this happens. + * + * @throws cudf::logic_error if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST` + * or `ARROW_DEVICE_CUDA_MANAGED` + * + * @throws cudf::data_type_error if the input array is not a struct array, non-struct + * arrays should be passed to `from_arrow_device_column` instead. + * + * @throws cudf::data_type_error if the input arrow data type is not supported. + * + * Each child of the input struct will be the columns of the resulting table_view. + * + * @note The custom deleter used for the unique_ptr to the table_view maintains ownership + * over any memory which is allocated, such as converting boolean columns from the bitmap + * used by Arrow to the 1-byte per value for cudf. + * + * @note If the input `ArrowDeviceArray` contained a non-null sync_event it is assumed + * to be a `cudaEvent_t*` and the passed in stream will have `cudaStreamWaitEvent` called + * on it with the event. This function, however, will not explicitly synchronize on the + * stream. 
+ * + * @param schema `ArrowSchema` pointer to object describing the type of the device array + * @param input `ArrowDeviceArray` pointer to object owning the Arrow data + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to perform any allocations + * @return `cudf::table_view` generated from given Arrow data + */ +unique_table_view_t from_arrow_device( + ArrowSchema const* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief typedef for a unique_ptr to a `cudf::column_view` with custom deleter + * + */ +using unique_column_view_t = + std::unique_ptr>; + +/** + * @brief Create `cudf::column_view` from given `ArrowDeviceArray` and `ArrowSchema` + * + * Constructs a non-owning `cudf::column_view` using `ArrowDeviceArray` and `ArrowSchema`, + * data must be accessible to the CUDA device. Because the resulting `cudf::column_view` will + * not own the data, the `ArrowDeviceArray` must be kept alive for the lifetime of the result. + * It is the responsibility of callers to ensure they call the release callback on the + * `ArrowDeviceArray` after it is no longer needed, and that the `cudf::column_view` is not + * accessed after this happens. + * + * @throws cudf::logic_error if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST` + * or `ARROW_DEVICE_CUDA_MANAGED` + * + * @throws cudf::data_type_error input arrow data type is not supported. + * + * @note The custom deleter used for the unique_ptr to the table_view maintains ownership + * over any memory which is allocated, such as converting boolean columns from the bitmap + * used by Arrow to the 1-byte per value for cudf. 
+ * + * @note If the input `ArrowDeviceArray` contained a non-null sync_event it is assumed + * to be a `cudaEvent_t*` and the passed in stream will have `cudaStreamWaitEvent` called + * on it with the event. This function, however, will not explicitly synchronize on the + * stream. + * + * @param schema `ArrowSchema` pointer to object describing the type of the device array + * @param input `ArrowDeviceArray` pointer to object owning the Arrow data + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to perform any allocations + * @return `cudf::column_view` generated from given Arrow data + */ +unique_column_view_t from_arrow_device_column( + ArrowSchema const* schema, + ArrowDeviceArray const* input, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/interop/detail/arrow.hpp b/cpp/include/cudf/interop/detail/arrow.hpp new file mode 100644 index 00000000000..8043ecf5422 --- /dev/null +++ b/cpp/include/cudf/interop/detail/arrow.hpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +// from Arrow C Device Data Interface +// https://arrow.apache.org/docs/format/CDeviceDataInterface.html +#ifndef ARROW_C_DEVICE_DATA_INTERFACE +#define ARROW_C_DEVICE_DATA_INTERFACE + +// Device type for the allocated memory +typedef int32_t ArrowDeviceType; + +// CPU device, same as using ArrowArray directly +#define ARROW_DEVICE_CPU 1 +// CUDA GPU Device +#define ARROW_DEVICE_CUDA 2 +// Pinned CUDA CPU memory by cudaMallocHost +#define ARROW_DEVICE_CUDA_HOST 3 +// CUDA managed/unified memory allocated by cudaMallocManaged +#define ARROW_DEVICE_CUDA_MANAGED 13 + +struct ArrowDeviceArray { + struct ArrowArray array; + int64_t device_id; + ArrowDeviceType device_type; + void* sync_event; + + // reserved bytes for future expansion + int64_t reserved[3]; +}; + +#endif // ARROW_C_DEVICE_DATA_INTERFACE diff --git a/cpp/include/cudf/io/avro.hpp b/cpp/include/cudf/io/avro.hpp index 89207302850..8bc74eb574c 100644 --- a/cpp/include/cudf/io/avro.hpp +++ b/cpp/include/cudf/io/avro.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -216,7 +217,7 @@ class avro_reader_options_builder { */ table_with_metadata read_avro( avro_reader_options const& options, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace io diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index 435583e805d..a20f75cecd7 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -105,6 +106,9 @@ class csv_reader_options { char _quotechar = '"'; // Whether a quote inside a value is double-quoted bool _doublequote = true; + // Whether to detect quotes surrounded by spaces e.g. ` "data" `. This flag has no effect when + // _doublequote is true + bool _detect_whitespace_around_quotes = false; // Names of columns to read as datetime std::vector _parse_dates_names; // Indexes of columns to read as datetime @@ -374,6 +378,17 @@ class csv_reader_options { */ [[nodiscard]] bool is_enabled_doublequote() const { return _doublequote; } + /** + * @brief Whether to detect quotes surrounded by spaces e.g. ` "data" `. This flag has no + * effect when _doublequote is true + * + * @return `true` if detect_whitespace_around_quotes is enabled + */ + [[nodiscard]] bool is_enabled_detect_whitespace_around_quotes() const + { + return _detect_whitespace_around_quotes; + } + /** * @brief Returns names of columns to read as datetime. * @@ -697,6 +712,14 @@ class csv_reader_options { */ void enable_doublequote(bool val) { _doublequote = val; } + /** + * @brief Sets whether to detect quotes surrounded by spaces e.g. ` "data" `. This flag has no + * effect when _doublequote is true + * + * @param val Boolean value to enable/disable + */ + void enable_detect_whitespace_around_quotes(bool val) { _detect_whitespace_around_quotes = val; } + /** * @brief Sets names of columns to read as datetime. * @@ -1125,6 +1148,19 @@ class csv_reader_options_builder { return *this; } + /** + * @brief Sets whether to detect quotes surrounded by spaces e.g. ` "data" `. 
This flag has no + * effect when _doublequote is true + * + * @param val Boolean value to enable/disable + * @return this for chaining + */ + csv_reader_options_builder& detect_whitespace_around_quotes(bool val) + { + options._detect_whitespace_around_quotes = val; + return *this; + } + /** * @brief Sets names of columns to read as datetime. * @@ -1315,8 +1351,8 @@ class csv_reader_options_builder { */ table_with_metadata read_csv( csv_reader_options options, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group /** @@ -1721,8 +1757,8 @@ class csv_writer_options_builder { * @param mr Device memory resource to use for device memory allocation */ void write_csv(csv_writer_options const& options, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace io diff --git a/cpp/include/cudf/io/detail/avro.hpp b/cpp/include/cudf/io/detail/avro.hpp index fede8e62d9f..fe9f935d2cc 100644 --- a/cpp/include/cudf/io/detail/avro.hpp +++ b/cpp/include/cudf/io/detail/avro.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include #include +#include namespace cudf { namespace io { @@ -39,7 +40,7 @@ namespace avro { table_with_metadata read_avro(std::unique_ptr&& source, avro_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace avro } // namespace detail diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 40ddcf385b0..50c1a7c163d 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include namespace cudf { namespace io { @@ -38,7 +39,7 @@ namespace csv { table_with_metadata read_csv(std::unique_ptr&& source, csv_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Write an entire dataset to CSV format. 
@@ -55,7 +56,7 @@ void write_csv(data_sink* sink, host_span column_names, csv_writer_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace csv } // namespace detail diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 3f7f7e9bb32..540a584908d 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -16,9 +16,11 @@ #pragma once +#include #include #include +#include namespace cudf::io::json::detail { @@ -35,7 +37,7 @@ namespace cudf::io::json::detail { table_with_metadata read_json(host_span> sources, json_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Write an entire dataset to JSON format. @@ -50,27 +52,27 @@ void write_json(data_sink* sink, table_view const& table, json_writer_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Normalize single quotes to double quotes using FST * - * @param inbuf Input device buffer + * @param indata Input device buffer * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ -rmm::device_uvector normalize_single_quotes(rmm::device_uvector&& inbuf, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +void normalize_single_quotes(datasource::owning_buffer>& indata, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** * @brief Normalize unquoted whitespace (space and tab characters) using FST * - * @param inbuf Input device buffer + * @param indata Input device buffer * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ 
-rmm::device_uvector normalize_whitespace(rmm::device_uvector&& inbuf, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +void normalize_whitespace(datasource::owning_buffer>& indata, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); } // namespace cudf::io::json::detail diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index 3c1486b60c2..597ddd9cf0a 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -23,6 +23,7 @@ #include #include +#include #include #include @@ -37,13 +38,15 @@ class chunked_orc_writer_options; namespace orc::detail { +// Forward declaration of the internal reader class +class reader_impl; + /** * @brief Class to read ORC dataset data into columns. */ class reader { private: - class impl; - std::unique_ptr _impl; + std::unique_ptr _impl; public: /** @@ -57,7 +60,7 @@ class reader { explicit reader(std::vector>&& sources, orc_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Destructor explicitly declared to avoid inlining in header @@ -67,10 +70,63 @@ class reader { /** * @brief Reads the entire dataset. * - * @param options Settings for controlling reading behavior * @return The set of columns along with table metadata */ - table_with_metadata read(orc_reader_options const& options); + table_with_metadata read(); +}; + +/** + * @brief The reader class that supports iterative reading from an array of data sources. 
+ */ +class chunked_reader { + private: + std::unique_ptr _impl; + + public: + /** + * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader(std::size_t, std::size_t, size_type, + * orc_reader_options const&, rmm::cuda_stream_view, rmm::device_async_resource_ref) + * + * @param sources Input `datasource` objects to read the dataset from + */ + explicit chunked_reader(std::size_t chunk_read_limit, + std::size_t pass_read_limit, + size_type output_row_granularity, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + /** + * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader(std::size_t, std::size_t, + * orc_reader_options const&, rmm::cuda_stream_view, rmm::device_async_resource_ref) + * + * @param sources Input `datasource` objects to read the dataset from + */ + explicit chunked_reader(std::size_t chunk_read_limit, + std::size_t pass_read_limit, + std::vector>&& sources, + orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + + /** + * @brief Destructor explicitly-declared to avoid inlined in header. + * + * Since the declaration of the internal `_impl` object does not exist in this header, this + * destructor needs to be defined in a separate source file which can access to that object's + * declaration. + */ + ~chunked_reader(); + + /** + * @copydoc cudf::io::chunked_orc_reader::has_next + */ + [[nodiscard]] bool has_next() const; + + /** + * @copydoc cudf::io::chunked_orc_reader::read_chunk + */ + [[nodiscard]] table_with_metadata read_chunk() const; }; /** @@ -124,14 +180,7 @@ class writer { * @brief Finishes the chunked/streamed write process. */ void close(); - - /** - * @brief Skip work done in `close()`; should be called if `write()` failed. - * - * Calling skip_close() prevents the writer from writing the (invalid) file footer and the - * postscript. 
- */ - void skip_close(); }; + } // namespace orc::detail } // namespace cudf::io diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 0b8ee9676de..978216d971e 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -65,7 +66,7 @@ class reader { explicit reader(std::vector>&& sources, parquet_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Destructor explicitly-declared to avoid inlined in header @@ -75,11 +76,9 @@ class reader { /** * @brief Reads the dataset as per given options. * - * @param options Settings for controlling reading behavior - * * @return The set of columns along with table metadata */ - table_with_metadata read(parquet_reader_options const& options); + table_with_metadata read(); }; /** @@ -101,6 +100,13 @@ class chunked_reader : private reader { * // Process chunk * } while (reader.has_next()); * + * // Alternatively + * + * while (reader.has_next()) { + * auto const chunk = reader.read_chunk(); + * // Process chunk + * } + * * ``` * * If `chunk_read_limit == 0` (i.e., no output limit), and `pass_read_limit == 0` (no input @@ -110,7 +116,7 @@ class chunked_reader : private reader { * The chunk_read_limit parameter controls the size of the output chunks produces. If the user * specifies 100 MB of data, the reader will attempt to return chunks containing tables that have * a total bytes size (over all columns) of 100 MB or less. This is a soft limit and the code - * will not fail if it cannot satisfy the limit. 
It will make a best-effort atttempt only. + * will not fail if it cannot satisfy the limit. It will make a best-effort attempt only. * * The pass_read_limit parameter controls how much temporary memory is used in the process of * decoding the file. The primary contributor to this memory usage is the uncompressed size of @@ -145,7 +151,7 @@ class chunked_reader : private reader { std::vector>&& sources, parquet_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Destructor explicitly-declared to avoid inlined in header. diff --git a/cpp/include/cudf/io/detail/tokenize_json.hpp b/cpp/include/cudf/io/detail/tokenize_json.hpp index b2ea29a85c3..d08c4e7c65a 100644 --- a/cpp/include/cudf/io/detail/tokenize_json.hpp +++ b/cpp/include/cudf/io/detail/tokenize_json.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ #include #include +#include namespace cudf::io::json { @@ -133,7 +134,7 @@ std::pair, rmm::device_uvector> ge device_span json_in, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 1f2628deea7..aa4bee4fb5e 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -100,6 +101,10 @@ class json_reader_options { bool _lines = false; // Parse mixed types as a string column bool _mixed_types_as_string = false; + // Delimiter separating records in JSON lines + char _delimiter = '\n'; + // Prune columns on read, selected based on the _dtypes option + bool _prune_columns = false; // Bytes to skip from the start size_t _byte_range_offset = 0; @@ -226,6 +231,13 @@ class json_reader_options { return base_padding + num_columns * column_bytes; } + /** + * @brief Returns delimiter separating records in JSON lines + * + * @return Delimiter separating records in JSON lines + */ + char get_delimiter() const { return _delimiter; } + /** * @brief Whether to read the file as a json object per line. * @@ -240,6 +252,17 @@ class json_reader_options { */ bool is_enabled_mixed_types_as_string() const { return _mixed_types_as_string; } + /** + * @brief Whether to prune columns on read, selected based on the @ref set_dtypes option. + * + * When set as true, if the reader options include @ref set_dtypes, then + * the reader will only return those columns which are mentioned in @ref set_dtypes. + * If false, then all columns are returned, independent of the @ref set_dtypes setting. + * + * @return True if column pruning is enabled + */ + bool is_enabled_prune_columns() const { return _prune_columns; } + /** * @brief Whether to parse dates as DD/MM versus MM/DD. 
* @@ -250,9 +273,11 @@ class json_reader_options { /** * @brief Whether the legacy reader should be used. * + * @deprecated Since 24.06 + * * @returns true if the legacy reader will be used, false otherwise */ - bool is_enabled_legacy() const { return _legacy; } + [[deprecated]] bool is_enabled_legacy() const { return _legacy; } /** * @brief Whether the reader should keep quotes of string values. @@ -324,6 +349,30 @@ class json_reader_options { */ void set_byte_range_size(size_type size) { _byte_range_size = size; } + /** + * @brief Set delimiter separating records in JSON lines + * + * @param delimiter Delimiter separating records in JSON lines + */ + void set_delimiter(char delimiter) + { + switch (delimiter) { + case '{': + case '[': + case '}': + case ']': + case ',': + case ':': + case '"': + case '\'': + case '\\': + case ' ': + case '\t': + case '\r': CUDF_FAIL("Unsupported delimiter character.", std::invalid_argument); break; + } + _delimiter = delimiter; + } + /** * @brief Set whether to read the file as a json object per line. * @@ -339,6 +388,17 @@ class json_reader_options { */ void enable_mixed_types_as_string(bool val) { _mixed_types_as_string = val; } + /** + * @brief Set whether to prune columns on read, selected based on the @ref set_dtypes option. + * + * When set as true, if the reader options include @ref set_dtypes, then + * the reader will only return those columns which are mentioned in @ref set_dtypes. + * If false, then all columns are returned, independent of the @ref set_dtypes setting. + * + * @param val Boolean value to enable/disable column pruning + */ + void enable_prune_columns(bool val) { _prune_columns = val; } + /** * @brief Set whether to parse dates as DD/MM versus MM/DD. * @@ -349,9 +409,11 @@ class json_reader_options { /** * @brief Set whether to use the legacy reader. 
* + * @deprecated Since 24.06 + * * @param val Boolean value to enable/disable the legacy reader */ - void enable_legacy(bool val) { _legacy = val; } + [[deprecated]] void enable_legacy(bool val) { _legacy = val; } /** * @brief Set whether the reader should keep quotes of string values. @@ -478,6 +540,18 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set delimiter separating records in JSON lines + * + * @param delimiter Delimiter separating records in JSON lines + * @return this for chaining + */ + json_reader_options_builder& delimiter(char delimiter) + { + options.set_delimiter(delimiter); + return *this; + } + /** * @brief Set whether to read the file as a json object per line. * @@ -503,6 +577,22 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether to prune columns on read, selected based on the @ref dtypes option. + * + * When set as true, if the reader options include @ref dtypes, then + * the reader will only return those columns which are mentioned in @ref dtypes. + * If false, then all columns are returned, independent of the @ref dtypes setting. + * + * @param val Boolean value to enable/disable column pruning + * @return this for chaining + */ + json_reader_options_builder& prune_columns(bool val) + { + options._prune_columns = val; + return *this; + } + /** * @brief Set whether to parse dates as DD/MM versus MM/DD. * @@ -518,10 +608,12 @@ class json_reader_options_builder { /** * @brief Set whether to use the legacy reader. 
* + * @deprecated Since 24.06 + * * @param val Boolean value to enable/disable legacy parsing * @return this for chaining */ - json_reader_options_builder& legacy(bool val) + [[deprecated]] json_reader_options_builder& legacy(bool val) { options._legacy = val; return *this; @@ -612,8 +704,8 @@ class json_reader_options_builder { */ table_with_metadata read_json( json_reader_options options, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group @@ -959,8 +1051,8 @@ class json_writer_options_builder { * @param mr Device memory resource to use for device memory allocation */ void write_json(json_writer_options const& options, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace io diff --git a/cpp/include/cudf/io/memory_resource.hpp b/cpp/include/cudf/io/memory_resource.hpp index ea79d6a3029..a36e220ae7b 100644 --- a/cpp/include/cudf/io/memory_resource.hpp +++ b/cpp/include/cudf/io/memory_resource.hpp @@ -18,6 +18,8 @@ #include +#include + namespace cudf::io { /** @@ -41,4 +43,23 @@ rmm::host_async_resource_ref set_host_memory_resource(rmm::host_async_resource_r */ rmm::host_async_resource_ref get_host_memory_resource(); +/** + * @brief Options to configure the default host memory resource + */ +struct host_mr_options { + std::optional pool_size; ///< The size of the pool to use for the default host memory + ///< resource. If not set, the default pool size is used. +}; + +/** + * @brief Configure the size of the default host memory resource. 
+ * + * @throws cudf::logic_error if called after the default host memory resource has been created + * + * @param opts Options to configure the default host memory resource + * @return True if this call successfully configured the host memory resource, false if a + * a resource was already configured. + */ +bool config_default_host_memory_resource(host_mr_options const& opts); + } // namespace cudf::io diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 5cc9ea81f29..8140f8897b7 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -57,10 +58,10 @@ class orc_reader_options { // List of individual stripes to read (ignored if empty) std::vector> _stripes; - // Rows to skip from the start; ORC stores the number of rows as uint64_t - uint64_t _skip_rows = 0; + // Rows to skip from the start + int64_t _skip_rows = 0; // Rows to read; `nullopt` is all - std::optional _num_rows; + std::optional _num_rows; // Whether to use row index to speed-up reading bool _use_index = true; @@ -124,7 +125,7 @@ class orc_reader_options { * * @return Number of rows to skip from the start */ - uint64_t get_skip_rows() const { return _skip_rows; } + int64_t get_skip_rows() const { return _skip_rows; } /** * @brief Returns number of row to read. @@ -132,7 +133,7 @@ class orc_reader_options { * @return Number of rows to read; `nullopt` if the option hasn't been set (in which case the file * is read until the end) */ - std::optional const& get_num_rows() const { return _num_rows; } + std::optional const& get_num_rows() const { return _num_rows; } /** * @brief Whether to use row index to speed-up reading. 
@@ -197,10 +198,10 @@ class orc_reader_options { * @throw cudf::logic_error if a negative value is passed * @throw cudf::logic_error if stripes have been previously set */ - void set_skip_rows(uint64_t rows) + void set_skip_rows(int64_t rows) { + CUDF_EXPECTS(rows >= 0, "skip_rows cannot be negative"); CUDF_EXPECTS(rows == 0 or _stripes.empty(), "Can't set both skip_rows along with stripes"); - CUDF_EXPECTS(rows <= std::numeric_limits::max(), "skip_rows is too large"); _skip_rows = rows; } @@ -212,7 +213,7 @@ class orc_reader_options { * @throw cudf::logic_error if a negative value is passed * @throw cudf::logic_error if stripes have been previously set */ - void set_num_rows(size_type nrows) + void set_num_rows(int64_t nrows) { CUDF_EXPECTS(nrows >= 0, "num_rows cannot be negative"); CUDF_EXPECTS(_stripes.empty(), "Can't set both num_rows and stripes"); @@ -270,7 +271,7 @@ class orc_reader_options_builder { * * @param src The source information used to read orc file */ - explicit orc_reader_options_builder(source_info src) : options{std::move(src)} {}; + explicit orc_reader_options_builder(source_info src) : options{std::move(src)} {} /** * @brief Sets names of the column to read. 
@@ -302,7 +303,7 @@ class orc_reader_options_builder { * @param rows Number of rows * @return this for chaining */ - orc_reader_options_builder& skip_rows(uint64_t rows) + orc_reader_options_builder& skip_rows(int64_t rows) { options.set_skip_rows(rows); return *this; @@ -314,7 +315,7 @@ class orc_reader_options_builder { * @param nrows Number of rows * @return this for chaining */ - orc_reader_options_builder& num_rows(size_type nrows) + orc_reader_options_builder& num_rows(int64_t nrows) { options.set_num_rows(nrows); return *this; @@ -402,8 +403,146 @@ class orc_reader_options_builder { */ table_with_metadata read_orc( orc_reader_options const& options, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief The chunked orc reader class to read an ORC file iteratively into a series of + * tables, chunk by chunk. + * + * This class is designed to address the reading issue when reading very large ORC files such + * that sizes of their columns exceed the limit that can be stored in cudf columns. By reading the + * file content by chunks using this class, each chunk is guaranteed to have its size stay within + * the given limit. + */ +class chunked_orc_reader { + public: + /** + * @brief Default constructor, this should never be used. + * + * This is added just to satisfy cython. + */ + chunked_orc_reader() = default; + + /** + * @brief Construct the reader from input/output size limits, output row granularity, along with + * other ORC reader options. 
+ * + * The typical usage should be similar to this: + * ``` + * do { + * auto const chunk = reader.read_chunk(); + * // Process chunk + * } while (reader.has_next()); + * + * ``` + * + * If `chunk_read_limit == 0` (i.e., no output limit) and `pass_read_limit == 0` (no temporary + * memory size limit), a call to `read_chunk()` will read the whole data source and return a table + * containing all rows. + * + * The `chunk_read_limit` parameter controls the size of the output table to be returned per + * `read_chunk()` call. If the user specifies a 100 MB limit, the reader will attempt to return + * tables that have a total bytes size (over all columns) of 100 MB or less. + * This is a soft limit and the code will not fail if it cannot satisfy the limit. + * + * The `pass_read_limit` parameter controls how much temporary memory is used in the entire + * process of loading, decompressing and decoding of data. Again, this is also a soft limit and + * the reader will try to make the best effort. + * + * Finally, the parameter `output_row_granularity` controls the changes in row number of the + * output chunk. For each call to `read_chunk()`, with respect to the given `pass_read_limit`, a + * subset of stripes may be loaded, decompressed and decoded into an intermediate table. The + * reader will then subdivide that table into smaller tables for final output using + * `output_row_granularity` as the subdivision step. 
+ * + * @param chunk_read_limit Limit on total number of bytes to be returned per `read_chunk()` call, + * or `0` if there is no limit + * @param pass_read_limit Limit on temporary memory usage for reading the data sources, + * or `0` if there is no limit + * @param output_row_granularity The granularity parameter used for subdividing the decoded + * table for final output + * @param options Settings for controlling reading behaviors + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + * + * @throw cudf::logic_error if `output_row_granularity` is non-positive + */ + explicit chunked_orc_reader( + std::size_t chunk_read_limit, + std::size_t pass_read_limit, + size_type output_row_granularity, + orc_reader_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + + /** + * @brief Construct the reader from input/output size limits along with other ORC reader options. + * + * This constructor implicitly calls the other constructor with `output_row_granularity` set to + * `DEFAULT_OUTPUT_ROW_GRANULARITY` rows. 
+ * + * @param chunk_read_limit Limit on total number of bytes to be returned per `read_chunk()` call, + * or `0` if there is no limit + * @param pass_read_limit Limit on temporary memory usage for reading the data sources, + * or `0` if there is no limit + * @param options Settings for controlling reading behaviors + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ + explicit chunked_orc_reader( + std::size_t chunk_read_limit, + std::size_t pass_read_limit, + orc_reader_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + + /** + * @brief Construct the reader from output size limits along with other ORC reader options. + * + * This constructor implicitly calls the other constructor with `pass_read_limit` set to `0` and + * `output_row_granularity` set to `DEFAULT_OUTPUT_ROW_GRANULARITY` rows. + * + * @param chunk_read_limit Limit on total number of bytes to be returned per `read_chunk()` call, + * or `0` if there is no limit + * @param options Settings for controlling reading behaviors + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ + explicit chunked_orc_reader( + std::size_t chunk_read_limit, + orc_reader_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + + /** + * @brief Destructor, destroying the internal reader instance. + */ + ~chunked_orc_reader(); + + /** + * @brief Check if there is any data in the given data sources that has not yet been read. 
+ * + * @return A boolean value indicating if there is any data left to read + */ + [[nodiscard]] bool has_next() const; + + /** + * @brief Read a chunk of rows in the given data sources. + * + * The sequence of returned tables, if concatenated by their order, guarantees to form a complete + * dataset as reading the entire given data sources at once. + * + * An empty table will be returned if the given sources are empty, or all the data has + * been read and returned by the previous calls. + * + * @return An output `cudf::table` along with its metadata + */ + [[nodiscard]] table_with_metadata read_chunk() const; + + private: + std::unique_ptr reader; +}; /** @} */ // end of group /** diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index 8f3eb1dff3c..35196a19349 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -154,6 +154,21 @@ struct timestamp_statistics : minmax_statistics { std::optional maximum_nanos; ///< nanoseconds part of the maximum }; +/** + * @brief Variant type for ORC type-specific column statistics. + * + * The variant can hold any of the supported column statistics types. + */ +using statistics_type = std::variant; + //! Orc I/O interfaces namespace orc { // forward declare the type that ProtobufReader uses. 
The `cudf::io::column_statistics` objects, @@ -171,16 +186,7 @@ struct column_statistics; struct column_statistics { std::optional number_of_values; ///< number of statistics std::optional has_null; ///< column has any nulls - std::variant - type_specific_stats; ///< type-specific statistics + statistics_type type_specific_stats; ///< type-specific statistics /** * @brief Construct a new column statistics object diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index dc035db8d39..b2f949cdcee 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -23,6 +23,7 @@ #include #include +#include #include #include @@ -70,6 +71,8 @@ class parquet_reader_options { bool _convert_strings_to_categories = false; // Whether to use PANDAS metadata to load columns bool _use_pandas_metadata = true; + // Whether to read and use ARROW schema + bool _use_arrow_schema = true; // Cast timestamp columns to a specific type data_type _timestamp_type{type_id::EMPTY}; @@ -125,6 +128,13 @@ class parquet_reader_options { */ [[nodiscard]] bool is_enabled_use_pandas_metadata() const { return _use_pandas_metadata; } + /** + * @brief Returns true/false depending whether to use arrow schema while reading. + * + * @return `true` if arrow schema is used while reading + */ + [[nodiscard]] bool is_enabled_use_arrow_schema() const { return _use_arrow_schema; } + /** * @brief Returns optional tree of metadata. * @@ -195,6 +205,31 @@ class parquet_reader_options { /** * @brief Sets AST based filter for predicate pushdown. * + * The filter can utilize cudf::ast::column_name_reference to reference a column by its name, + * even if it's not necessarily present in the requested projected columns. + * To refer to output column indices, you can use cudf::ast::column_reference. + * + * For a parquet with columns ["A", "B", "C", ... 
"X", "Y", "Z"], + * Example 1: with/without column projection + * @code + * use_columns({"A", "X", "Z"}) + * .filter(operation(ast_operator::LESS, column_name_reference{"C"}, literal{100})); + * @endcode + * Column "C" need not be present in output table. + * Example 2: without column projection + * @code + * filter(operation(ast_operator::LESS, column_reference{1}, literal{100})); + * @endcode + * Here, `1` will refer to column "B" because output will contain all columns in + * order ["A", ..., "Z"]. + * Example 3: with column projection + * @code + * use_columns({"A", "Z", "X"}) + * .filter(operation(ast_operator::LESS, column_reference{1}, literal{100})); + * @endcode + * Here, `1` will refer to column "Z" because output will contain 3 columns in + * order ["A", "Z", "X"]. + * * @param filter AST expression to use as filter */ void set_filter(ast::expression const& filter) { _filter = filter; } @@ -213,6 +248,13 @@ class parquet_reader_options { */ void enable_use_pandas_metadata(bool val) { _use_pandas_metadata = val; } + /** + * @brief Sets to enable/disable use of arrow schema to read. + * + * @param val Boolean value whether to use arrow schema + */ + void enable_use_arrow_schema(bool val) { _use_arrow_schema = val; } + /** * @brief Sets reader column schema. * @@ -292,9 +334,7 @@ class parquet_reader_options_builder { } /** - * @brief Sets vector of individual row groups to read. - * - * @param filter Vector of row groups to read + * @copydoc parquet_reader_options::set_filter * @return this for chaining */ parquet_reader_options_builder& filter(ast::expression const& filter) @@ -327,6 +367,18 @@ class parquet_reader_options_builder { return *this; } + /** + * @brief Sets to enable/disable use of arrow schema to read. 
+ * + * @param val Boolean value whether to use arrow schema + * @return this for chaining + */ + parquet_reader_options_builder& use_arrow_schema(bool val) + { + options._use_arrow_schema = val; + return *this; + } + /** * @brief Sets reader metadata. * @@ -409,8 +461,8 @@ class parquet_reader_options_builder { */ table_with_metadata read_parquet( parquet_reader_options const& options, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief The chunked parquet reader class to read Parquet file iteratively in to a series of @@ -446,8 +498,8 @@ class chunked_parquet_reader { chunked_parquet_reader( std::size_t chunk_read_limit, parquet_reader_options const& options, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Constructor for chunked reader. @@ -472,8 +524,8 @@ class chunked_parquet_reader { std::size_t chunk_read_limit, std::size_t pass_read_limit, parquet_reader_options const& options, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Destructor, destroying the internal reader instance. 
@@ -515,6 +567,15 @@ class chunked_parquet_reader { * @file */ +/** + * @brief Struct used to describe column sorting metadata + */ +struct sorting_column { + int column_idx{}; //!< leaf column index within the row group + bool is_descending{false}; //!< true if sort order is descending + bool is_nulls_first{true}; //!< true if nulls come before non-null values +}; + class parquet_writer_options_builder; /** @@ -554,7 +615,7 @@ class parquet_writer_options { // Maximum size of min or max values in column index int32_t _column_index_truncate_length = default_column_index_truncate_length; // When to use dictionary encoding for data - dictionary_policy _dictionary_policy = dictionary_policy::ALWAYS; + dictionary_policy _dictionary_policy = dictionary_policy::ADAPTIVE; // Maximum size of column chunk dictionary (in bytes) size_t _max_dictionary_size = default_max_dictionary_size; // Maximum number of rows in a page fragment @@ -563,6 +624,8 @@ class parquet_writer_options { std::shared_ptr _compression_stats; // write V2 page headers? bool _v2_page_headers = false; + // Which columns in _table are used for sorting + std::optional> _sorting_columns; /** * @brief Constructor from sink and table. @@ -761,6 +824,13 @@ class parquet_writer_options { */ [[nodiscard]] auto is_enabled_write_v2_headers() const { return _v2_page_headers; } + /** + * @brief Returns the sorting_columns. + * + * @return Column sort order metadata + */ + [[nodiscard]] auto const& get_sorting_columns() const { return _sorting_columns; } + /** * @brief Sets partitions. * @@ -892,6 +962,16 @@ class parquet_writer_options { * @param val Boolean value to enable/disable writing of V2 page headers. */ void enable_write_v2_headers(bool val) { _v2_page_headers = val; } + + /** + * @brief Sets sorting columns. 
+ * + * @param sorting_columns Column sort order metadata + */ + void set_sorting_columns(std::vector sorting_columns) + { + _sorting_columns = std::move(sorting_columns); + } }; /** @@ -1066,7 +1146,7 @@ class parquet_writer_options_builder { * dictionary_policy::ALWAYS will allow the use of dictionary encoding even if it will result in * the disabling of compression for columns that would otherwise be compressed. * - * The default value is dictionary_policy::ALWAYS. + * The default value is dictionary_policy::ADAPTIVE. * * @param val policy for dictionary use * @return this for chaining @@ -1143,6 +1223,14 @@ class parquet_writer_options_builder { */ parquet_writer_options_builder& write_v2_headers(bool enabled); + /** + * @brief Sets column sorting metadata to chunked_parquet_writer_options. + * + * @param sorting_columns Column sort order metadata + * @return this for chaining + */ + parquet_writer_options_builder& sorting_columns(std::vector sorting_columns); + /** * @brief move parquet_writer_options member once it's built. */ @@ -1221,7 +1309,7 @@ class chunked_parquet_writer_options { // Maximum size of min or max values in column index int32_t _column_index_truncate_length = default_column_index_truncate_length; // When to use dictionary encoding for data - dictionary_policy _dictionary_policy = dictionary_policy::ALWAYS; + dictionary_policy _dictionary_policy = dictionary_policy::ADAPTIVE; // Maximum size of column chunk dictionary (in bytes) size_t _max_dictionary_size = default_max_dictionary_size; // Maximum number of rows in a page fragment @@ -1230,6 +1318,8 @@ class chunked_parquet_writer_options { std::shared_ptr _compression_stats; // write V2 page headers? bool _v2_page_headers = false; + // Which columns in _table are used for sorting + std::optional> _sorting_columns; /** * @brief Constructor from sink. 
@@ -1384,6 +1474,13 @@ class chunked_parquet_writer_options { */ [[nodiscard]] auto is_enabled_write_v2_headers() const { return _v2_page_headers; } + /** + * @brief Returns the sorting_columns. + * + * @return Column sort order metadata + */ + [[nodiscard]] auto const& get_sorting_columns() const { return _sorting_columns; } + /** * @brief Sets metadata. * @@ -1501,6 +1598,16 @@ class chunked_parquet_writer_options { */ void enable_write_v2_headers(bool val) { _v2_page_headers = val; } + /** + * @brief Sets sorting columns. + * + * @param sorting_columns Column sort order metadata + */ + void set_sorting_columns(std::vector sorting_columns) + { + _sorting_columns = std::move(sorting_columns); + } + /** * @brief creates builder to build chunked_parquet_writer_options. * @@ -1695,7 +1802,7 @@ class chunked_parquet_writer_options_builder { * dictionary_policy::ALWAYS will allow the use of dictionary encoding even if it will result in * the disabling of compression for columns that would otherwise be compressed. * - * The default value is dictionary_policy::ALWAYS. + * The default value is dictionary_policy::ADAPTIVE. * * @param val policy for dictionary use * @return this for chaining @@ -1740,6 +1847,15 @@ class chunked_parquet_writer_options_builder { return *this; } + /** + * @brief Sets column sorting metadata to chunked_parquet_writer_options. + * + * @param sorting_columns Column sort order metadata + * @return this for chaining + */ + chunked_parquet_writer_options_builder& sorting_columns( + std::vector sorting_columns); + /** * @brief move chunked_parquet_writer_options member once it's built. */ diff --git a/cpp/include/cudf/io/parquet_metadata.hpp b/cpp/include/cudf/io/parquet_metadata.hpp index 3149b5b5945..e0c406c180c 100644 --- a/cpp/include/cudf/io/parquet_metadata.hpp +++ b/cpp/include/cudf/io/parquet_metadata.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -59,6 +59,13 @@ enum class TypeKind : int8_t { */ struct parquet_column_schema { public: + /** + * @brief Default constructor. + * + * This has been added since Cython requires a default constructor to create objects on stack. + */ + explicit parquet_column_schema() = default; + /** * @brief constructor * @@ -134,6 +141,13 @@ struct parquet_column_schema { */ struct parquet_schema { public: + /** + * @brief Default constructor. + * + * This has been added since Cython requires a default constructor to create objects on stack. + */ + explicit parquet_schema() = default; + /** * @brief constructor * @@ -165,6 +179,15 @@ class parquet_metadata { public: /// Key-value metadata in the file footer. using key_value_metadata = std::unordered_map; + /// row group metadata from each RowGroup element. + using row_group_metadata = std::unordered_map; + + /** + * @brief Default constructor. + * + * This has been added since Cython requires a default constructor to create objects on stack. 
+ */ + explicit parquet_metadata() = default; /** * @brief constructor @@ -173,15 +196,18 @@ class parquet_metadata { * @param num_rows number of rows * @param num_rowgroups number of row groups * @param file_metadata key-value metadata in the file footer + * @param rg_metadata vector of maps containing metadata for each row group */ parquet_metadata(parquet_schema schema, int64_t num_rows, size_type num_rowgroups, - key_value_metadata file_metadata) + key_value_metadata file_metadata, + std::vector rg_metadata) : _schema{std::move(schema)}, _num_rows{num_rows}, _num_rowgroups{num_rowgroups}, - _file_metadata{std::move(file_metadata)} + _file_metadata{std::move(file_metadata)}, + _rowgroup_metadata{std::move(rg_metadata)} { } @@ -207,6 +233,7 @@ class parquet_metadata { * @return Number of row groups */ [[nodiscard]] auto num_rowgroups() const { return _num_rowgroups; } + /** * @brief Returns the Key value metadata in the file footer. * @@ -214,11 +241,19 @@ class parquet_metadata { */ [[nodiscard]] auto const& metadata() const { return _file_metadata; } + /** + * @brief Returns the row group metadata in the file footer. 
+ * + * @return vector of row group metadata as maps + */ + [[nodiscard]] auto const& rowgroup_metadata() const { return _rowgroup_metadata; } + private: parquet_schema _schema; int64_t _num_rows; size_type _num_rowgroups; key_value_metadata _file_metadata; + std::vector _rowgroup_metadata; }; /** diff --git a/cpp/include/cudf/io/text/detail/tile_state.hpp b/cpp/include/cudf/io/text/detail/tile_state.hpp index d42624aa9b7..aa9185b4983 100644 --- a/cpp/include/cudf/io/text/detail/tile_state.hpp +++ b/cpp/include/cudf/io/text/detail/tile_state.hpp @@ -16,6 +16,8 @@ #pragma once +#include + #include #include @@ -81,7 +83,7 @@ struct scan_tile_state { scan_tile_state(cudf::size_type num_tiles, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) : tile_status(rmm::device_uvector>( num_tiles, stream, mr)), tile_state_partial(rmm::device_uvector(num_tiles, stream, mr)), diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp index 7bb2e4e2ece..e0b9c7635e3 100644 --- a/cpp/include/cudf/io/text/detail/trie.hpp +++ b/cpp/include/cudf/io/text/detail/trie.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -165,7 +166,7 @@ struct trie { */ static trie create(std::string const& pattern, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return create(std::vector{pattern}, stream, mr); @@ -181,7 +182,7 @@ struct trie { */ static trie create(std::vector const& patterns, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { std::vector tokens; std::vector transitions; diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp index a7edc9be0e4..7abae7c754b 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -85,18 +86,18 @@ struct parse_options { std::unique_ptr multibyte_split( data_chunk_source const& source, std::string const& delimiter, - parse_options options = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + parse_options options = {}, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); std::unique_ptr multibyte_split( data_chunk_source const& source, std::string const& delimiter, std::optional byte_range, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); std::unique_ptr multibyte_split(data_chunk_source const& source, std::string const& delimiter, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace text } // namespace io diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 64d627483e6..150e997f533 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -113,6 +113,7 @@ enum class column_encoding { ///< valid for BYTE_ARRAY columns) DELTA_BYTE_ARRAY, ///< Use DELTA_BYTE_ARRAY encoding (only valid for ///< BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY columns) + BYTE_STREAM_SPLIT, ///< Use BYTE_STREAM_SPLIT encoding (valid for all fixed width types) // ORC encodings: DIRECT, ///< Use DIRECT encoding DIRECT_V2, ///< Use DIRECT_V2 encoding @@ -235,6 +236,8 @@ enum dictionary_policy { struct column_name_info { std::string name; ///< Column name std::optional is_nullable; ///< Column nullability + std::optional is_binary; ///< Column is binary (i.e. 
not a list) + std::optional type_length; ///< Byte width of data (for fixed length data) std::vector children; ///< Child column names /** @@ -242,9 +245,12 @@ struct column_name_info { * * @param _name Column name * @param _is_nullable True if column is nullable + * @param _is_binary True if column is binary data */ - column_name_info(std::string const& _name, std::optional _is_nullable = std::nullopt) - : name(_name), is_nullable(_is_nullable) + column_name_info(std::string const& _name, + std::optional _is_nullable = std::nullopt, + std::optional _is_binary = std::nullopt) + : name(_name), is_nullable(_is_nullable), is_binary(_is_binary) { } @@ -602,8 +608,10 @@ class column_in_metadata { bool _list_column_is_map = false; bool _use_int96_timestamp = false; bool _output_as_binary = false; + bool _skip_compression = false; std::optional _decimal_precision; std::optional _parquet_field_id; + std::optional _type_length; std::vector children; column_encoding _encoding = column_encoding::USE_DEFAULT; @@ -691,6 +699,19 @@ class column_in_metadata { return *this; } + /** + * @brief Set the data length of the column. Only valid if this column is a + * fixed-length byte array. + * + * @param length The data length to set for this column + * @return this for chaining + */ + column_in_metadata& set_type_length(int32_t length) noexcept + { + _type_length = length; + return *this; + } + /** * @brief Set the parquet field id of this column. * @@ -722,6 +743,19 @@ class column_in_metadata { return *this; } + /** + * @brief Specifies whether this column should not be compressed regardless of the compression + * codec specified for the file. + * + * @param skip If `true` do not compress this column + * @return this for chaining + */ + column_in_metadata& set_skip_compression(bool skip) noexcept + { + _skip_compression = skip; + return *this; + } + /** * @brief Sets the encoding to use for this column. 
 * @@ -811,6 +845,22 @@ */ [[nodiscard]] uint8_t get_decimal_precision() const { return _decimal_precision.value(); } + /** + * @brief Get whether type length has been set for this column + * + * @return Boolean indicating whether type length has been set for this column + */ + [[nodiscard]] bool is_type_length_set() const noexcept { return _type_length.has_value(); } + + /** + * @brief Get the type length that was set for this column. + * + * @throws std::bad_optional_access If type length was not set for this + * column. Check using `is_type_length_set()` first. + * @return The type length that was set for this column + */ + [[nodiscard]] uint8_t get_type_length() const { return _type_length.value(); } + /** + * @brief Get whether parquet field id has been set for this column. + * @@ -844,6 +894,13 @@ */ [[nodiscard]] bool is_enabled_output_as_binary() const noexcept { return _output_as_binary; } + /** + * @brief Get whether to skip compressing this column + * + * @return Boolean indicating whether to skip compression of this column + */ + [[nodiscard]] bool is_enabled_skip_compression() const noexcept { return _skip_compression; } + /** + * @brief Get the encoding that was set for this column. * @@ -910,6 +967,7 @@ struct partition_info { class reader_column_schema { // Whether to read binary data as a string column bool _convert_binary_to_strings{true}; + int32_t _type_length{0}; std::vector children; @@ -975,6 +1033,18 @@ return *this; } + /** + * @brief Sets the length of fixed length data. 
+ * + * @param type_length Size of the data type in bytes + * @return this for chaining + */ + reader_column_schema& set_type_length(int32_t type_length) + { + _type_length = type_length; + return *this; + } + /** * @brief Get whether to encode this column as binary or string data * @@ -985,6 +1055,13 @@ class reader_column_schema { return _convert_binary_to_strings; } + /** + * @brief Get the length in bytes of this fixed length data. + * + * @return The length in bytes of the data type + */ + [[nodiscard]] int32_t get_type_length() const { return _type_length; } + /** * @brief Get the number of child objects * diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index b7a3129cfec..825f758adbd 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -95,8 +96,8 @@ std::pair>, std::unique_ptr>> inner_join(cudf::table_view const& left_keys, cudf::table_view const& right_keys, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + null_equality compare_nulls = null_equality::EQUAL, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a pair of row index vectors corresponding to a @@ -135,8 +136,8 @@ std::pair>, std::unique_ptr>> left_join(cudf::table_view const& left_keys, cudf::table_view const& right_keys, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + null_equality compare_nulls = null_equality::EQUAL, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a pair of row index vectors corresponding to a @@ -174,8 +175,8 @@ std::pair>, std::unique_ptr>> full_join(cudf::table_view const& left_keys, cudf::table_view const& right_keys, - null_equality compare_nulls = null_equality::EQUAL, - 
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + null_equality compare_nulls = null_equality::EQUAL, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a vector of row indices corresponding to a left semi-join @@ -202,8 +203,8 @@ full_join(cudf::table_view const& left_keys, std::unique_ptr> left_semi_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + null_equality compare_nulls = null_equality::EQUAL, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a vector of row indices corresponding to a left anti join @@ -233,8 +234,8 @@ std::unique_ptr> left_semi_join( std::unique_ptr> left_anti_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + null_equality compare_nulls = null_equality::EQUAL, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a cross join on two tables (`left`, `right`) @@ -261,7 +262,7 @@ std::unique_ptr> left_anti_join( std::unique_ptr cross_join( cudf::table_view const& left, cudf::table_view const& right, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief The enum class to specify if any of the input join tables (`build` table and any later @@ -340,7 +341,7 @@ class hash_join { inner_join(cudf::table_view const& probe, std::optional output_size = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + rmm::device_async_resource_ref mr = 
rmm::mr::get_current_device_resource()) const; /** * Returns the row indices that can be used to construct the result of performing @@ -365,7 +366,7 @@ class hash_join { left_join(cudf::table_view const& probe, std::optional output_size = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const; /** * Returns the row indices that can be used to construct the result of performing @@ -390,7 +391,7 @@ class hash_join { full_join(cudf::table_view const& probe, std::optional output_size = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const; /** * Returns the exact number of matches (rows) when performing an inner join with the specified @@ -441,8 +442,8 @@ class hash_join { */ std::size_t full_join_size( cudf::table_view const& probe, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const; private: const std::unique_ptr _impl; @@ -497,8 +498,8 @@ class distinct_hash_join { */ std::pair>, std::unique_ptr>> - inner_join(rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + inner_join(rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const; /** * @brief Returns the build table indices that can be used to construct the result of performing @@ -515,8 +516,8 @@ class distinct_hash_join { * join between two tables 
with `build` and `probe` as the join keys. */ std::unique_ptr> left_join( - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const; private: using impl_type = typename cudf::detail::distinct_hash_join; ///< Implementation type @@ -561,12 +562,11 @@ class distinct_hash_join { */ std::pair>, std::unique_ptr>> -conditional_inner_join( - table_view const& left, - table_view const& right, - ast::expression const& binary_predicate, - std::optional output_size = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +conditional_inner_join(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + std::optional output_size = {}, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a pair of row index vectors corresponding to all pairs @@ -611,7 +611,7 @@ conditional_left_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a pair of row index vectors corresponding to all pairs @@ -653,7 +653,7 @@ std::pair>, conditional_full_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns an index vector corresponding to all rows in the left table @@ -692,7 +692,7 @@ std::unique_ptr> conditional_left_semi_join( table_view const& right, ast::expression const& 
binary_predicate, std::optional output_size = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns an index vector corresponding to all rows in the left table @@ -731,7 +731,7 @@ std::unique_ptr> conditional_left_anti_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a pair of row index vectors corresponding to all pairs of @@ -789,7 +789,7 @@ mixed_inner_join( ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, std::optional>> output_size_data = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a pair of row index vectors corresponding to all pairs of @@ -849,7 +849,7 @@ mixed_left_join( ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, std::optional>> output_size_data = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a pair of row index vectors corresponding to all pairs of @@ -909,7 +909,7 @@ mixed_full_join( ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, std::optional>> output_size_data = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns an index vector corresponding to all rows in the left tables @@ -944,9 +944,6 @@ mixed_full_join( * @param right_conditional The right 
table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not - * @param output_size_data An optional pair of values indicating the exact output size and the - * number of matches for each row in the larger of the two input tables, left or right (may be - * precomputed using the corresponding mixed_full_join_size API). * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -958,9 +955,8 @@ std::unique_ptr> mixed_left_semi_join( table_view const& left_conditional, table_view const& right_conditional, ast::expression const& binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - std::optional>> output_size_data = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + null_equality compare_nulls = null_equality::EQUAL, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns an index vector corresponding to all rows in the left tables @@ -996,9 +992,6 @@ std::unique_ptr> mixed_left_semi_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not - * @param output_size_data An optional pair of values indicating the exact output size and the - * number of matches for each row in the larger of the two input tables, left or right (may be - * precomputed using the corresponding mixed_full_join_size API). 
* @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -1010,9 +1003,8 @@ std::unique_ptr> mixed_left_anti_join( table_view const& left_conditional, table_view const& right_conditional, ast::expression const& binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - std::optional>> output_size_data = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + null_equality compare_nulls = null_equality::EQUAL, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the exact number of matches (rows) when performing a @@ -1051,8 +1043,8 @@ std::pair>> mixed_in table_view const& left_conditional, table_view const& right_conditional, ast::expression const& binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + null_equality compare_nulls = null_equality::EQUAL, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the exact number of matches (rows) when performing a @@ -1091,86 +1083,8 @@ std::pair>> mixed_le table_view const& left_conditional, table_view const& right_conditional, ast::expression const& binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Returns the exact number of matches (rows) when performing a mixed - * left semi join between the specified tables where the columns of the - * equality table are equal and the predicate evaluates to true on the - * conditional tables. - * - * If the provided predicate returns NULL for a pair of rows (left, right), - * that pair is not included in the output. 
It is the user's responsibility to - * choose a suitable compare_nulls value AND use appropriate null-safe - * operators in the expression. - * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. - * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not - * match. - * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not - * match. - * - * @param left_equality The left table used for the equality join - * @param right_equality The right table used for the equality join - * @param left_conditional The left table used for the conditional join - * @param right_conditional The right table used for the conditional join - * @param binary_predicate The condition on which to join - * @param compare_nulls Whether or not null values join to each other or not - * @param mr Device memory resource used to allocate the returned table and columns' device memory - * - * @return A pair containing the size that would result from performing the - * requested join and the number of matches for each row in one of the two - * tables. Which of the two tables is an implementation detail and should not - * be relied upon, simply passed to the corresponding `mixed_left_join` API as - * is. - */ -std::pair>> mixed_left_semi_join_size( - table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Returns the exact number of matches (rows) when performing a mixed - * left anti join between the specified tables. - * - * If the provided predicate returns NULL for a pair of rows (left, right), - * that pair is not included in the output. 
It is the user's responsibility to - * choose a suitable compare_nulls value AND use appropriate null-safe - * operators in the expression. - * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. - * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not - * match. - * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not - * match. - * - * @param left_equality The left table used for the equality join - * @param right_equality The right table used for the equality join - * @param left_conditional The left table used for the conditional join - * @param right_conditional The right table used for the conditional join - * @param binary_predicate The condition on which to join - * @param compare_nulls Whether or not null values join to each other or not - * @param mr Device memory resource used to allocate the returned table and columns' device memory - * - * @return A pair containing the size that would result from performing the - * requested join and the number of matches for each row in one of the two - * tables. Which of the two tables is an implementation detail and should not - * be relied upon, simply passed to the corresponding `mixed_left_join` API as - * is. 
- */ -std::pair>> mixed_left_anti_join_size( - table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + null_equality compare_nulls = null_equality::EQUAL, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the exact number of matches (rows) when performing a @@ -1193,7 +1107,7 @@ std::size_t conditional_inner_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the exact number of matches (rows) when performing a @@ -1216,7 +1130,7 @@ std::size_t conditional_left_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the exact number of matches (rows) when performing a @@ -1239,7 +1153,7 @@ std::size_t conditional_left_semi_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the exact number of matches (rows) when performing a @@ -1262,6 +1176,6 @@ std::size_t conditional_left_anti_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + 
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/json/json.hpp b/cpp/include/cudf/json/json.hpp index 944e0c26dd6..385e8e54bdc 100644 --- a/cpp/include/cudf/json/json.hpp +++ b/cpp/include/cudf/json/json.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include #include @@ -167,9 +168,9 @@ class get_json_object_options { std::unique_ptr get_json_object( cudf::strings_column_view const& col, cudf::string_scalar const& json_path, - get_json_object_options options = get_json_object_options{}, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + get_json_object_options options = get_json_object_options{}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace cudf diff --git a/cpp/include/cudf/labeling/label_bins.hpp b/cpp/include/cudf/labeling/label_bins.hpp index d8ea262dfe1..9091e31a9ea 100644 --- a/cpp/include/cudf/labeling/label_bins.hpp +++ b/cpp/include/cudf/labeling/label_bins.hpp @@ -22,6 +22,7 @@ #include #include +#include namespace cudf { @@ -74,8 +75,8 @@ std::unique_ptr label_bins( inclusive left_inclusive, column_view const& right_edges, inclusive right_inclusive, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff 
--git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp index 0d9c1c157eb..853562acfff 100644 --- a/cpp/include/cudf/lists/combine.hpp +++ b/cpp/include/cudf/lists/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include namespace cudf { @@ -66,7 +67,7 @@ std::unique_ptr concatenate_rows( table_view const& input, concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Concatenating multiple lists on the same row of a lists column into a single list. @@ -97,7 +98,7 @@ std::unique_ptr concatenate_list_elements( column_view const& input, concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace lists diff --git a/cpp/include/cudf/lists/contains.hpp b/cpp/include/cudf/lists/contains.hpp index 7cf67ec9205..060882555aa 100644 --- a/cpp/include/cudf/lists/contains.hpp +++ b/cpp/include/cudf/lists/contains.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,6 +19,7 @@ #include #include +#include namespace cudf { namespace lists { @@ -49,8 +50,8 @@ namespace lists { std::unique_ptr contains( cudf::lists_column_view const& lists, cudf::scalar const& search_key, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create a column of `bool` values indicating whether the list rows of the first @@ -73,8 +74,8 @@ std::unique_ptr contains( std::unique_ptr contains( cudf::lists_column_view const& lists, cudf::column_view const& search_keys, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create a column of `bool` values indicating whether each row in the `lists` column @@ -95,8 +96,8 @@ std::unique_ptr contains( */ std::unique_ptr contains_nulls( cudf::lists_column_view const& lists, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Option to choose whether `index_of()` returns the first or last match @@ -138,9 +139,9 @@ enum class duplicate_find_option : int32_t { std::unique_ptr index_of( cudf::lists_column_view const& lists, cudf::scalar const& search_key, - duplicate_find_option find_option = duplicate_find_option::FIND_FIRST, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + duplicate_find_option 
find_option = duplicate_find_option::FIND_FIRST, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create a column of values indicating the position of a search key @@ -175,9 +176,9 @@ std::unique_ptr index_of( std::unique_ptr index_of( cudf::lists_column_view const& lists, cudf::column_view const& search_keys, - duplicate_find_option find_option = duplicate_find_option::FIND_FIRST, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + duplicate_find_option find_option = duplicate_find_option::FIND_FIRST, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace lists diff --git a/cpp/include/cudf/lists/count_elements.hpp b/cpp/include/cudf/lists/count_elements.hpp index e4bd0dca9ae..2b9f5aa5607 100644 --- a/cpp/include/cudf/lists/count_elements.hpp +++ b/cpp/include/cudf/lists/count_elements.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,6 +19,7 @@ #include #include +#include namespace cudf { namespace lists { @@ -51,8 +52,8 @@ namespace lists { */ std::unique_ptr count_elements( lists_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of lists_elements group diff --git a/cpp/include/cudf/lists/detail/combine.hpp b/cpp/include/cudf/lists/detail/combine.hpp index 4bc45e48a9f..bd4c01bbb4b 100644 --- a/cpp/include/cudf/lists/detail/combine.hpp +++ b/cpp/include/cudf/lists/detail/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace lists { namespace detail { @@ -30,7 +32,7 @@ namespace detail { std::unique_ptr concatenate_rows(table_view const& input, concatenate_null_policy null_policy, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::lists::concatenate_list_elements @@ -40,7 +42,7 @@ std::unique_ptr concatenate_rows(table_view const& input, std::unique_ptr concatenate_list_elements(column_view const& input, concatenate_null_policy null_policy, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace lists diff --git a/cpp/include/cudf/lists/detail/concatenate.hpp b/cpp/include/cudf/lists/detail/concatenate.hpp index a1f149d4ccf..d67958ef260 100644 --- a/cpp/include/cudf/lists/detail/concatenate.hpp +++ b/cpp/include/cudf/lists/detail/concatenate.hpp @@ -1,5 +1,5 @@ /* - * 
Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include +#include namespace cudf { namespace lists { @@ -45,7 +46,7 @@ namespace detail { */ std::unique_ptr concatenate(host_span columns, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace lists diff --git a/cpp/include/cudf/lists/detail/contains.hpp b/cpp/include/cudf/lists/detail/contains.hpp index 58ec18cb9ef..638cc7afb81 100644 --- a/cpp/include/cudf/lists/detail/contains.hpp +++ b/cpp/include/cudf/lists/detail/contains.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ #include #include +#include + namespace cudf { namespace lists { namespace detail { @@ -26,49 +28,49 @@ namespace detail { * @copydoc cudf::lists::index_of(cudf::lists_column_view const&, * cudf::scalar const&, * duplicate_find_option, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr index_of(cudf::lists_column_view const& lists, cudf::scalar const& search_key, cudf::lists::duplicate_find_option find_option, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::lists::index_of(cudf::lists_column_view const&, * cudf::column_view const&, * duplicate_find_option, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr index_of(cudf::lists_column_view const& lists, cudf::column_view const& search_keys, cudf::lists::duplicate_find_option find_option, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::lists::contains(cudf::lists_column_view const&, * cudf::scalar const&, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr contains(cudf::lists_column_view const& lists, cudf::scalar const& search_key, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::lists::contains(cudf::lists_column_view const&, * cudf::column_view const&, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr contains(cudf::lists_column_view const& lists, cudf::column_view const& search_keys, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace lists } // namespace cudf diff --git a/cpp/include/cudf/lists/detail/copying.hpp b/cpp/include/cudf/lists/detail/copying.hpp index 3760294f079..18a70bba5e9 100644 --- a/cpp/include/cudf/lists/detail/copying.hpp +++ b/cpp/include/cudf/lists/detail/copying.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include #include +#include namespace cudf { namespace lists { @@ -46,7 +47,7 @@ std::unique_ptr copy_slice(lists_column_view const& lists, size_type start, size_type end, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace lists diff --git a/cpp/include/cudf/lists/detail/extract.hpp b/cpp/include/cudf/lists/detail/extract.hpp index 013f9b491dd..6f983d44bc9 100644 --- a/cpp/include/cudf/lists/detail/extract.hpp +++ b/cpp/include/cudf/lists/detail/extract.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,29 +18,31 @@ #include #include +#include + namespace cudf { namespace lists { namespace detail { /** * @copydoc cudf::lists::extract_list_element(lists_column_view, size_type, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_list_element(lists_column_view lists_column, size_type const index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::lists::extract_list_element(lists_column_view, column_view const&, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr extract_list_element(lists_column_view lists_column, column_view const& indices, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace lists diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh index 03428bc347f..0cd77556f33 100644 --- a/cpp/include/cudf/lists/detail/gather.cuh +++ b/cpp/include/cudf/lists/detail/gather.cuh @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -59,7 +60,7 @@ struct gather_data { * MapItType gather_map, * size_type gather_map_size, * rmm::cuda_stream_view stream, - * rmm::mr::device_memory_resource* mr) + * rmm::device_async_resource_ref mr) * * @param prev_base_offsets The buffer backing the base offsets used in the gather map. We can * free this buffer before allocating the new one to keep peak memory @@ -71,7 +72,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, size_type gather_map_size, rmm::device_uvector&& prev_base_offsets, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // size of the gather map is the # of output rows size_type output_count = gather_map_size; @@ -252,7 +253,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, MapItType gather_map, size_type gather_map_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return make_gather_data( source_column, @@ -278,7 +279,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, std::unique_ptr gather_list_nested(lists_column_view const& list, gather_data& gd, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Gather a leaf column from a hierarchy of list columns. 
@@ -295,13 +296,13 @@ std::unique_ptr gather_list_nested(lists_column_view const& list, std::unique_ptr gather_list_leaf(column_view const& column, gather_data const& gd, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::lists::segmented_gather(lists_column_view const& source_column, * lists_column_view const& gather_map_list, * out_of_bounds_policy bounds_policy, - * rmm::mr::device_memory_resource* mr) + * rmm::device_async_resource_ref mr) * * @param stream CUDA stream on which to execute kernels */ @@ -309,7 +310,7 @@ std::unique_ptr segmented_gather(lists_column_view const& source_column, lists_column_view const& gather_map_list, out_of_bounds_policy bounds_policy, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace lists diff --git a/cpp/include/cudf/lists/detail/interleave_columns.hpp b/cpp/include/cudf/lists/detail/interleave_columns.hpp index a5cf67c95b9..3aff93840a9 100644 --- a/cpp/include/cudf/lists/detail/interleave_columns.hpp +++ b/cpp/include/cudf/lists/detail/interleave_columns.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,6 +19,7 @@ #include #include +#include namespace cudf { namespace lists { @@ -47,7 +48,7 @@ namespace detail { std::unique_ptr interleave_columns(table_view const& input, bool has_null_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace lists diff --git a/cpp/include/cudf/lists/detail/lists_column_factories.hpp b/cpp/include/cudf/lists/detail/lists_column_factories.hpp index 7b821a00b0d..192aee8d811 100644 --- a/cpp/include/cudf/lists/detail/lists_column_factories.hpp +++ b/cpp/include/cudf/lists/detail/lists_column_factories.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include namespace cudf { namespace lists { @@ -38,7 +39,7 @@ namespace detail { std::unique_ptr make_lists_column_from_scalar(list_scalar const& value, size_type size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Create an empty lists column. @@ -51,7 +52,7 @@ std::unique_ptr make_lists_column_from_scalar(list_scalar const& v */ std::unique_ptr make_empty_lists_column(data_type child_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Create a lists column with all null rows. 
@@ -64,7 +65,7 @@ std::unique_ptr make_empty_lists_column(data_type child_type, std::unique_ptr make_all_nulls_lists_column(size_type size, data_type child_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace lists diff --git a/cpp/include/cudf/lists/detail/reverse.hpp b/cpp/include/cudf/lists/detail/reverse.hpp index 6e3b952a3b0..d099a0708b9 100644 --- a/cpp/include/cudf/lists/detail/reverse.hpp +++ b/cpp/include/cudf/lists/detail/reverse.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ #include +#include + namespace cudf::lists::detail { /** @@ -25,6 +27,6 @@ namespace cudf::lists::detail { */ std::unique_ptr reverse(lists_column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace cudf::lists::detail diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index 5fc52ff1c04..c550ad5b94f 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -53,7 +54,7 @@ rmm::device_uvector list_vector_from_column( IndexIterator index_begin, IndexIterator index_end, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto n_rows = thrust::distance(index_begin, index_end); @@ -98,9 +99,9 @@ std::unique_ptr scatter_impl(rmm::device_uvector cons column_view const& source, column_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(column_types_equal(source, target), "Mismatched 
column types."); + CUDF_EXPECTS(have_same_types(source, target), "Mismatched column types."); auto const child_column_type = lists_column_view(target).child().type(); @@ -177,7 +178,7 @@ std::unique_ptr scatter(column_view const& source, MapIterator scatter_map_end, column_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const num_rows = target.size(); if (num_rows == 0) { return cudf::empty_like(target); } @@ -233,7 +234,7 @@ std::unique_ptr scatter(scalar const& slr, MapIterator scatter_map_end, column_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const num_rows = target.size(); if (num_rows == 0) { return cudf::empty_like(target); } diff --git a/cpp/include/cudf/lists/detail/scatter_helper.cuh b/cpp/include/cudf/lists/detail/scatter_helper.cuh index 605f76871b5..fc44e0bc290 100644 --- a/cpp/include/cudf/lists/detail/scatter_helper.cuh +++ b/cpp/include/cudf/lists/detail/scatter_helper.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,6 +23,7 @@ #include #include +#include #include @@ -136,7 +137,7 @@ std::unique_ptr build_lists_child_column_recursive( cudf::lists_column_view const& source_lists_column_view, cudf::lists_column_view const& target_lists_column_view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace lists diff --git a/cpp/include/cudf/lists/detail/set_operations.hpp b/cpp/include/cudf/lists/detail/set_operations.hpp index 51fc58bee07..8746b1ba62a 100644 --- a/cpp/include/cudf/lists/detail/set_operations.hpp +++ b/cpp/include/cudf/lists/detail/set_operations.hpp @@ -22,6 +22,7 @@ #include #include +#include namespace cudf::lists::detail { @@ -35,7 +36,7 @@ std::unique_ptr have_overlap(lists_column_view const& lhs, null_equality nulls_equal, nan_equality nans_equal, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::list::intersect_distinct @@ -47,7 +48,7 @@ std::unique_ptr intersect_distinct(lists_column_view const& lhs, null_equality nulls_equal, nan_equality nans_equal, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::list::union_distinct @@ -59,7 +60,7 @@ std::unique_ptr union_distinct(lists_column_view const& lhs, null_equality nulls_equal, nan_equality nans_equal, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::list::difference_distinct @@ -71,7 +72,7 @@ std::unique_ptr difference_distinct(lists_column_view const& lhs, null_equality nulls_equal, nan_equality nans_equal, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** @} */ // end of group } // namespace cudf::lists::detail diff --git a/cpp/include/cudf/lists/detail/sorting.hpp b/cpp/include/cudf/lists/detail/sorting.hpp index 
c378ca8cf06..e428ea84ce6 100644 --- a/cpp/include/cudf/lists/detail/sorting.hpp +++ b/cpp/include/cudf/lists/detail/sorting.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include +#include namespace cudf { namespace lists { @@ -32,7 +33,7 @@ std::unique_ptr sort_lists(lists_column_view const& input, order column_order, null_order null_precedence, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::lists::stable_sort_lists @@ -43,7 +44,7 @@ std::unique_ptr stable_sort_lists(lists_column_view const& input, order column_order, null_order null_precedence, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace lists diff --git a/cpp/include/cudf/lists/detail/stream_compaction.hpp b/cpp/include/cudf/lists/detail/stream_compaction.hpp index 7ab9cf9a343..f5e5b29bc8f 100644 --- a/cpp/include/cudf/lists/detail/stream_compaction.hpp +++ b/cpp/include/cudf/lists/detail/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,19 +19,20 @@ #include #include +#include namespace cudf::lists::detail { /** * @copydoc cudf::lists::apply_boolean_mask(lists_column_view const&, lists_column_view const&, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches */ std::unique_ptr apply_boolean_mask(lists_column_view const& input, lists_column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::list::distinct @@ -42,6 +43,6 @@ std::unique_ptr distinct(lists_column_view const& input, null_equality nulls_equal, nan_equality nans_equal, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace cudf::lists::detail diff --git a/cpp/include/cudf/lists/explode.hpp b/cpp/include/cudf/lists/explode.hpp index adf46805855..81d82dcfa09 100644 --- a/cpp/include/cudf/lists/explode.hpp +++ b/cpp/include/cudf/lists/explode.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include #include @@ -72,7 +73,7 @@ namespace cudf { std::unique_ptr
explode( table_view const& input_table, size_type explode_column_idx, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Explodes a list column's elements and includes a position column. @@ -116,7 +117,7 @@ std::unique_ptr
explode( std::unique_ptr
explode_position( table_view const& input_table, size_type explode_column_idx, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Explodes a list column's elements retaining any null entries or empty lists inside. @@ -158,7 +159,7 @@ std::unique_ptr
explode_position( std::unique_ptr
explode_outer( table_view const& input_table, size_type explode_column_idx, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Explodes a list column's elements retaining any null entries or empty lists and includes a @@ -202,7 +203,7 @@ std::unique_ptr
explode_outer( std::unique_ptr
explode_outer_position( table_view const& input_table, size_type explode_column_idx, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/extract.hpp b/cpp/include/cudf/lists/extract.hpp index 14c0f59e17d..096d276fcfb 100644 --- a/cpp/include/cudf/lists/extract.hpp +++ b/cpp/include/cudf/lists/extract.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include namespace cudf { namespace lists { @@ -66,8 +67,8 @@ namespace lists { std::unique_ptr extract_list_element( lists_column_view const& lists_column, size_type index, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create a column where each row is a single element from the corresponding sublist @@ -107,8 +108,8 @@ std::unique_ptr extract_list_element( std::unique_ptr extract_list_element( lists_column_view const& lists_column, column_view const& indices, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace lists diff --git a/cpp/include/cudf/lists/filling.hpp b/cpp/include/cudf/lists/filling.hpp index 3730e16482d..1d840c76bf8 100644 --- a/cpp/include/cudf/lists/filling.hpp +++ 
b/cpp/include/cudf/lists/filling.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include #include @@ -66,8 +67,8 @@ namespace cudf::lists { std::unique_ptr sequences( column_view const& starts, column_view const& sizes, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create a lists column in which each row contains a sequence of values specified by a tuple @@ -108,8 +109,8 @@ std::unique_ptr sequences( column_view const& starts, column_view const& steps, column_view const& sizes, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf::lists diff --git a/cpp/include/cudf/lists/gather.hpp b/cpp/include/cudf/lists/gather.hpp index 5e6ab6816e6..a0d79c05098 100644 --- a/cpp/include/cudf/lists/gather.hpp +++ b/cpp/include/cudf/lists/gather.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ #include #include +#include namespace cudf { namespace lists { @@ -73,9 +74,9 @@ namespace lists { std::unique_ptr segmented_gather( lists_column_view const& source_column, lists_column_view const& gather_map_list, - out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace lists diff --git a/cpp/include/cudf/lists/reverse.hpp b/cpp/include/cudf/lists/reverse.hpp index 864cd796f72..34c40c5a3ba 100644 --- a/cpp/include/cudf/lists/reverse.hpp +++ b/cpp/include/cudf/lists/reverse.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include #include @@ -48,8 +49,8 @@ namespace cudf::lists { */ std::unique_ptr reverse( lists_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/lists/set_operations.hpp b/cpp/include/cudf/lists/set_operations.hpp index 6fb8989f0bb..b8abfd62461 100644 --- a/cpp/include/cudf/lists/set_operations.hpp +++ b/cpp/include/cudf/lists/set_operations.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include namespace cudf::lists { /** @@ -59,10 +60,10 @@ namespace cudf::lists { std::unique_ptr have_overlap( lists_column_view const& lhs, lists_column_view const& rhs, - null_equality nulls_equal = null_equality::EQUAL, - nan_equality nans_equal = nan_equality::ALL_EQUAL, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create a lists column of distinct elements common to two input lists columns. @@ -96,10 +97,10 @@ std::unique_ptr have_overlap( std::unique_ptr intersect_distinct( lists_column_view const& lhs, lists_column_view const& rhs, - null_equality nulls_equal = null_equality::EQUAL, - nan_equality nans_equal = nan_equality::ALL_EQUAL, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create a lists column of distinct elements found in either of two input lists columns. 
@@ -133,10 +134,10 @@ std::unique_ptr intersect_distinct( std::unique_ptr union_distinct( lists_column_view const& lhs, lists_column_view const& rhs, - null_equality nulls_equal = null_equality::EQUAL, - nan_equality nans_equal = nan_equality::ALL_EQUAL, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create a lists column of distinct elements found only in the left input column. @@ -170,10 +171,10 @@ std::unique_ptr union_distinct( std::unique_ptr difference_distinct( lists_column_view const& lhs, lists_column_view const& rhs, - null_equality nulls_equal = null_equality::EQUAL, - nan_equality nans_equal = nan_equality::ALL_EQUAL, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf::lists diff --git a/cpp/include/cudf/lists/sorting.hpp b/cpp/include/cudf/lists/sorting.hpp index 39a52c75a98..78cea191bc5 100644 --- a/cpp/include/cudf/lists/sorting.hpp +++ b/cpp/include/cudf/lists/sorting.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include #include +#include namespace cudf { namespace lists { @@ -55,8 +56,8 @@ std::unique_ptr sort_lists( lists_column_view const& source_column, order column_order, null_order null_precedence, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Segmented sort of the elements within a list in each row of a list column using stable @@ -68,8 +69,8 @@ std::unique_ptr stable_sort_lists( lists_column_view const& source_column, order column_order, null_order null_precedence, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace lists diff --git a/cpp/include/cudf/lists/stream_compaction.hpp b/cpp/include/cudf/lists/stream_compaction.hpp index 3ac4f6861ec..31f09d37560 100644 --- a/cpp/include/cudf/lists/stream_compaction.hpp +++ b/cpp/include/cudf/lists/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include #include +#include namespace cudf::lists { @@ -61,8 +62,8 @@ namespace cudf::lists { std::unique_ptr apply_boolean_mask( lists_column_view const& input, lists_column_view const& boolean_mask, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create a new list column without duplicate elements in each list. @@ -86,10 +87,10 @@ std::unique_ptr apply_boolean_mask( */ std::unique_ptr distinct( lists_column_view const& input, - null_equality nulls_equal = null_equality::EQUAL, - nan_equality nans_equal = nan_equality::ALL_EQUAL, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/merge.hpp b/cpp/include/cudf/merge.hpp index 8886ec24bfe..29aa3ffe934 100644 --- a/cpp/include/cudf/merge.hpp +++ b/cpp/include/cudf/merge.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -105,7 +106,7 @@ std::unique_ptr merge( std::vector const& key_cols, std::vector const& column_order, std::vector const& null_precedence = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 524296e60ca..9e375df140b 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include #include @@ -89,8 +90,8 @@ size_type num_bitmask_words(size_type number_of_bits); rmm::device_buffer create_null_mask( size_type size, mask_state state, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Sets a pre-allocated bitmask buffer to a given state in the range @@ -132,8 +133,8 @@ rmm::device_buffer copy_bitmask( bitmask_type const* mask, size_type begin_bit, size_type end_bit, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Copies `view`'s bitmask from the bits @@ -149,8 +150,8 @@ rmm::device_buffer copy_bitmask( */ rmm::device_buffer copy_bitmask( column_view const& view, - 
rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Performs bitwise AND of the bitmasks of columns of a table. Returns @@ -166,8 +167,8 @@ rmm::device_buffer copy_bitmask( */ std::pair bitmask_and( table_view const& view, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Performs bitwise OR of the bitmasks of columns of a table. Returns @@ -183,8 +184,8 @@ std::pair bitmask_and( */ std::pair bitmask_or( table_view const& view, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Given a validity bitmask, counts the number of null elements (unset bits) diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp index 2c91bdf64f5..9ed56297908 100644 --- a/cpp/include/cudf/partitioning.hpp +++ b/cpp/include/cudf/partitioning.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -33,6 +34,14 @@ namespace cudf { * @brief Column partitioning APIs */ +/** + * @brief Identifies the hash function to be used in hash partitioning + */ +enum class hash_id { + HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed + HASH_MURMUR3 ///< Murmur3 hash function +}; + /** * @brief Partitions rows of `t` according to the mapping specified by * `partition_map`. @@ -70,7 +79,7 @@ std::pair, std::vector> partition( table_view const& t, column_view const& partition_map, size_type num_partitions, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Partitions rows from the input table into multiple output tables. @@ -96,10 +105,10 @@ std::pair, std::vector> hash_partition( table_view const& input, std::vector const& columns_to_hash, int num_partitions, - hash_id hash_function = hash_id::HASH_MURMUR3, - uint32_t seed = DEFAULT_HASH_SEED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + hash_id hash_function = hash_id::HASH_MURMUR3, + uint32_t seed = DEFAULT_HASH_SEED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Round-robin partition. 
@@ -241,8 +250,8 @@ std::pair, std::vector> hash_partition( std::pair, std::vector> round_robin_partition( table_view const& input, cudf::size_type num_partitions, - cudf::size_type start_partition = 0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::size_type start_partition = 0, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/quantiles.hpp b/cpp/include/cudf/quantiles.hpp index 1f3c26fa077..a1c98ee4e9d 100644 --- a/cpp/include/cudf/quantiles.hpp +++ b/cpp/include/cudf/quantiles.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include +#include namespace cudf { /** @@ -56,10 +57,10 @@ namespace cudf { std::unique_ptr quantile( column_view const& input, std::vector const& q, - interpolation interp = interpolation::LINEAR, - column_view const& ordered_indices = {}, - bool exact = true, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + interpolation interp = interpolation::LINEAR, + column_view const& ordered_indices = {}, + bool exact = true, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the rows of the input corresponding to the requested quantiles. @@ -98,7 +99,7 @@ std::unique_ptr
quantiles( cudf::sorted is_input_sorted = sorted::NO, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Calculate approximate percentiles on an input tdigest column. @@ -125,7 +126,7 @@ std::unique_ptr
quantiles( std::unique_ptr percentile_approx( tdigest::tdigest_column_view const& input, column_view const& percentiles, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/reduction.hpp b/cpp/include/cudf/reduction.hpp index 52aebeb55e5..52f39925a2d 100644 --- a/cpp/include/cudf/reduction.hpp +++ b/cpp/include/cudf/reduction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include #include @@ -74,6 +75,7 @@ enum class scan_type : bool { INCLUSIVE, EXCLUSIVE }; * @param col Input column view * @param agg Aggregation operator applied by the reduction * @param output_dtype The output scalar type + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned scalar's device memory * @returns Output scalar with reduce result */ @@ -81,7 +83,8 @@ std::unique_ptr reduce( column_view const& col, reduce_aggregation const& agg, data_type output_dtype, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Computes the reduction of the values in all rows of a column with an initial value @@ -95,6 +98,7 @@ std::unique_ptr reduce( * @param agg Aggregation operator applied by the reduction * @param output_dtype The output scalar type * @param init The initial value of the reduction + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory 
resource used to allocate the returned scalar's device memory * @returns Output scalar with reduce result */ @@ -103,7 +107,8 @@ std::unique_ptr reduce( reduce_aggregation const& agg, data_type output_dtype, std::optional> init, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Compute reduction of each segment in the input column @@ -144,6 +149,7 @@ std::unique_ptr reduce( * @param null_handling If `INCLUDE`, the reduction is valid if all elements in a segment are valid, * otherwise null. If `EXCLUDE`, the reduction is valid if any element in the segment is valid, * otherwise null. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned scalar's device memory * @returns Output column with results of segmented reduction */ @@ -153,7 +159,8 @@ std::unique_ptr segmented_reduce( segmented_reduce_aggregation const& agg, data_type output_dtype, null_policy null_handling, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Compute reduction of each segment in the input column with an initial value. Only SUM, @@ -168,6 +175,7 @@ std::unique_ptr segmented_reduce( * otherwise null. If `EXCLUDE`, the reduction is valid if any element in the segment is valid, * otherwise null. * @param init The initial value of the reduction + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned scalar's device memory * @returns Output column with results of segmented reduction. 
*/ @@ -178,7 +186,8 @@ std::unique_ptr segmented_reduce( data_type output_dtype, null_policy null_handling, std::optional> init, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Computes the scan of a column. @@ -194,6 +203,7 @@ std::unique_ptr segmented_reduce( * exclusive scan if scan_type::EXCLUSIVE. * @param[in] null_handling Exclude null values when computing the result if null_policy::EXCLUDE. * Include nulls if null_policy::INCLUDE. Any operation with a null results in a null. + * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned scalar's device memory * @returns Scanned output column */ @@ -201,21 +211,24 @@ std::unique_ptr scan( column_view const& input, scan_aggregation const& agg, scan_type inclusive, - null_policy null_handling = null_policy::EXCLUDE, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + null_policy null_handling = null_policy::EXCLUDE, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Determines the minimum and maximum values of a column. * * * @param col column to compute minmax + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A std::pair of scalars with the first scalar being the minimum value and the second * scalar being the maximum value of the input column. 
*/ std::pair, std::unique_ptr> minmax( column_view const& col, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/reduction/detail/histogram.hpp b/cpp/include/cudf/reduction/detail/histogram.hpp index 97c711fda4e..f23c5a14e33 100644 --- a/cpp/include/cudf/reduction/detail/histogram.hpp +++ b/cpp/include/cudf/reduction/detail/histogram.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -42,7 +43,7 @@ namespace cudf::reduction::detail { compute_row_frequencies(table_view const& input, std::optional const& partial_counts, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Create an empty histogram column. 
diff --git a/cpp/include/cudf/reduction/detail/reduction.cuh b/cpp/include/cudf/reduction/detail/reduction.cuh index 9807d4cb4ea..7d1754d86f2 100644 --- a/cpp/include/cudf/reduction/detail/reduction.cuh +++ b/cpp/include/cudf/reduction/detail/reduction.cuh @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -62,7 +63,7 @@ std::unique_ptr reduce(InputIterator d_in, op::simple_op op, std::optional init, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const binary_op = cudf::detail::cast_functor(op.get_binary_op()); auto const initial_value = init.value_or(op.template get_identity()); @@ -105,7 +106,7 @@ std::unique_ptr reduce(InputIterator d_in, op::simple_op op, std::optional init, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FAIL( "This function should never be called. fixed_point reduce should always go through the reduce " @@ -122,7 +123,7 @@ std::unique_ptr reduce(InputIterator d_in, op::simple_op op, std::optional init, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const binary_op = cudf::detail::cast_functor(op.get_binary_op()); auto const initial_value = init.value_or(op.template get_identity()); @@ -188,7 +189,7 @@ std::unique_ptr reduce(InputIterator d_in, cudf::size_type valid_count, cudf::size_type ddof, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const binary_op = cudf::detail::cast_functor(op.get_binary_op()); auto const initial_value = op.template get_identity(); diff --git a/cpp/include/cudf/reduction/detail/reduction.hpp b/cpp/include/cudf/reduction/detail/reduction.hpp index 4cbfb82ae6b..78f90a1e2c9 100644 --- a/cpp/include/cudf/reduction/detail/reduction.hpp +++ b/cpp/include/cudf/reduction/detail/reduction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA 
CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,13 +20,15 @@ #include #include +#include + #include namespace cudf::reduction::detail { /** * @copydoc cudf::reduce(column_view const&, reduce_aggregation const&, data_type, - * std::optional>, rmm::mr::device_memory_resource*) + * std::optional>, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -35,6 +37,6 @@ std::unique_ptr reduce(column_view const& col, data_type output_dtype, std::optional> init, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace cudf::reduction::detail diff --git a/cpp/include/cudf/reduction/detail/reduction_functions.hpp b/cpp/include/cudf/reduction/detail/reduction_functions.hpp index 704332c8e1d..31d465619b9 100644 --- a/cpp/include/cudf/reduction/detail/reduction_functions.hpp +++ b/cpp/include/cudf/reduction/detail/reduction_functions.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include #include +#include #include @@ -47,7 +48,7 @@ std::unique_ptr sum(column_view const& col, data_type const output_dtype, std::optional> init, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Computes minimum of elements in input column @@ -67,7 +68,7 @@ std::unique_ptr min(column_view const& col, data_type const output_dtype, std::optional> init, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Computes maximum of elements in input column @@ -87,7 +88,7 @@ std::unique_ptr max(column_view const& col, data_type const output_dtype, std::optional> init, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Computes any of elements in input column is true when typecasted to bool @@ -108,7 +109,7 @@ std::unique_ptr any(column_view const& col, data_type const output_dtype, std::optional> init, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Computes all of elements in input column is true when typecasted to bool @@ -129,7 +130,7 @@ std::unique_ptr all(column_view const& col, data_type const output_dtype, std::optional> init, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Compute frequency for each unique element in the input column. @@ -144,7 +145,7 @@ std::unique_ptr all(column_view const& col, */ std::unique_ptr histogram(column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Merge multiple histograms together. 
@@ -156,7 +157,7 @@ std::unique_ptr histogram(column_view const& input, */ std::unique_ptr merge_histogram(column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Computes product of elements in input column @@ -177,7 +178,7 @@ std::unique_ptr product(column_view const& col, data_type const output_dtype, std::optional> init, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Computes sum of squares of elements in input column @@ -196,7 +197,7 @@ std::unique_ptr product(column_view const& col, std::unique_ptr sum_of_squares(column_view const& col, data_type const output_dtype, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Computes mean of elements in input column @@ -215,7 +216,7 @@ std::unique_ptr sum_of_squares(column_view const& col, std::unique_ptr mean(column_view const& col, data_type const output_dtype, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Computes variance of elements in input column @@ -237,7 +238,7 @@ std::unique_ptr variance(column_view const& col, data_type const output_dtype, size_type ddof, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Computes standard deviation of elements in input column @@ -259,7 +260,7 @@ std::unique_ptr standard_deviation(column_view const& col, data_type const output_dtype, size_type ddof, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Returns nth element in input column @@ -289,7 +290,7 @@ std::unique_ptr nth_element(column_view const& col, size_type n, null_policy null_handling, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + 
rmm::device_async_resource_ref mr); /** * @brief Collect input column into a (list) scalar @@ -303,7 +304,7 @@ std::unique_ptr nth_element(column_view const& col, std::unique_ptr collect_list(column_view const& col, null_policy null_handling, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Merge a bunch of list scalars into single list scalar @@ -315,7 +316,7 @@ std::unique_ptr collect_list(column_view const& col, */ std::unique_ptr merge_lists(lists_column_view const& col, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Collect input column into a (list) scalar without duplicated elements @@ -333,7 +334,7 @@ std::unique_ptr collect_set(column_view const& col, null_equality nulls_equal, nan_equality nans_equal, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Merge a bunch of list scalars into single list scalar then drop duplicated elements @@ -349,7 +350,7 @@ std::unique_ptr merge_sets(lists_column_view const& col, null_equality nulls_equal, nan_equality nans_equal, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace reduction diff --git a/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp b/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp index 3902a7200a9..770ac6580ef 100644 --- a/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp +++ b/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include #include +#include #include @@ -57,7 +58,7 @@ std::unique_ptr segmented_sum(column_view const& col, null_policy null_handling, std::optional> init, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Computes product of each segment in the input column @@ -87,7 +88,7 @@ std::unique_ptr segmented_product(column_view const& col, null_policy null_handling, std::optional> init, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Compute minimum of each segment in the input column @@ -116,7 +117,7 @@ std::unique_ptr segmented_min(column_view const& col, null_policy null_handling, std::optional> init, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Compute maximum of each segment in the input column @@ -145,7 +146,7 @@ std::unique_ptr segmented_max(column_view const& col, null_policy null_handling, std::optional> init, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Compute if any of the values in the segment are true when typecasted to bool @@ -175,7 +176,7 @@ std::unique_ptr segmented_any(column_view const& col, null_policy null_handling, std::optional> init, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Compute if all of the values in the segment are true when typecasted to bool @@ -205,7 +206,7 @@ std::unique_ptr segmented_all(column_view const& col, null_policy null_handling, std::optional> init, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Computes mean of elements of segments in the input column @@ -233,7 +234,7 @@ std::unique_ptr segmented_mean(column_view const& col, data_type const output_dtype, null_policy 
null_handling, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Computes sum of squares of elements of segments in the input column @@ -261,7 +262,7 @@ std::unique_ptr segmented_sum_of_squares(column_view const& col, data_type const output_dtype, null_policy null_handling, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Computes the standard deviation of elements of segments in the input column @@ -292,7 +293,7 @@ std::unique_ptr segmented_standard_deviation(column_view const& col, null_policy null_handling, size_type ddof, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Computes the variance of elements of segments in the input column @@ -323,7 +324,7 @@ std::unique_ptr segmented_variance(column_view const& col, null_policy null_handling, size_type ddof, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Counts the number of unique values within each segment of a column @@ -351,7 +352,7 @@ std::unique_ptr segmented_nunique(column_view const& col, device_span offsets, null_policy null_handling, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace reduction diff --git a/cpp/include/cudf/replace.hpp b/cpp/include/cudf/replace.hpp index 3405dc8b796..ae20e72f023 100644 --- a/cpp/include/cudf/replace.hpp +++ b/cpp/include/cudf/replace.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include #include +#include #include @@ -55,8 +56,8 @@ enum class replace_policy : bool { PRECEDING, FOLLOWING }; std::unique_ptr replace_nulls( column_view const& input, column_view const& replacement, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Replaces all null values in a column with a scalar. @@ -74,8 +75,8 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nulls( column_view const& input, scalar const& replacement, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Replaces all null values in a column with the first non-null value that precedes/follows. 
@@ -93,8 +94,8 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nulls( column_view const& input, replace_policy const& replace_policy, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Replaces all NaN values in a column with corresponding values from another column @@ -121,8 +122,8 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nans( column_view const& input, column_view const& replacement, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Replaces all NaN values in a column with a scalar @@ -148,8 +149,8 @@ std::unique_ptr replace_nans( std::unique_ptr replace_nans( column_view const& input, scalar const& replacement, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Return a copy of `input_col` replacing any `values_to_replace[i]` @@ -167,8 +168,8 @@ std::unique_ptr find_and_replace_all( column_view const& input_col, column_view const& values_to_replace, column_view const& replacement_values, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Replaces values less than `lo` in 
`input` with `lo_replace`, @@ -222,8 +223,8 @@ std::unique_ptr clamp( scalar const& lo_replace, scalar const& hi, scalar const& hi_replace, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Replaces values less than `lo` in `input` with `lo`, @@ -268,8 +269,8 @@ std::unique_ptr clamp( column_view const& input, scalar const& lo, scalar const& hi, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Copies from a column of floating-point elements and replaces `-NaN` and `-0.0` with `+NaN` @@ -288,8 +289,8 @@ std::unique_ptr clamp( */ std::unique_ptr normalize_nans_and_zeros( column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Modifies a column of floating-point elements to replace all `-NaN` and `-0.0` with `+NaN` diff --git a/cpp/include/cudf/reshape.hpp b/cpp/include/cudf/reshape.hpp index 42cfb890a31..26316be7fd4 100644 --- a/cpp/include/cudf/reshape.hpp +++ b/cpp/include/cudf/reshape.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ #include #include +#include #include @@ -52,7 +53,7 @@ namespace cudf { */ std::unique_ptr interleave_columns( table_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Repeats the rows from `input` table `count` times to form a new table. @@ -75,7 +76,7 @@ std::unique_ptr interleave_columns( std::unique_ptr
tile( table_view const& input, size_type count, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Configures whether byte casting flips endianness @@ -100,7 +101,7 @@ enum class flip_endianness : bool { NO, YES }; std::unique_ptr byte_cast( column_view const& input_column, flip_endianness endian_configuration, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/rolling.hpp b/cpp/include/cudf/rolling.hpp index ec93c709163..2cd34f48265 100644 --- a/cpp/include/cudf/rolling.hpp +++ b/cpp/include/cudf/rolling.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include #include @@ -66,7 +67,7 @@ std::unique_ptr rolling_window( size_type following_window, size_type min_periods, rolling_aggregation const& agg, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief @copybrief rolling_window @@ -76,7 +77,7 @@ std::unique_ptr rolling_window( * size_type following_window, * size_type min_periods, * rolling_aggregation const& agg, - * rmm::mr::device_memory_resource* mr) + * rmm::device_async_resource_ref mr) * * @param default_outputs A column of per-row default values to be returned instead * of nulls. 
Used for LEAD()/LAG(), if the row offset crosses @@ -89,7 +90,7 @@ std::unique_ptr rolling_window( size_type following_window, size_type min_periods, rolling_aggregation const& agg, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Abstraction for window boundary sizes @@ -237,7 +238,7 @@ std::unique_ptr grouped_rolling_window( size_type following_window, size_type min_periods, rolling_aggregation const& aggr, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief @copybrief grouped_rolling_window @@ -248,7 +249,7 @@ std::unique_ptr grouped_rolling_window( * size_type following_window, * size_type min_periods, * rolling_aggregation const& aggr, - * rmm::mr::device_memory_resource* mr) + * rmm::device_async_resource_ref mr) */ std::unique_ptr grouped_rolling_window( table_view const& group_keys, @@ -257,7 +258,7 @@ std::unique_ptr grouped_rolling_window( window_bounds following_window, size_type min_periods, rolling_aggregation const& aggr, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief @copybrief grouped_rolling_window @@ -268,7 +269,7 @@ std::unique_ptr grouped_rolling_window( * size_type following_window, * size_type min_periods, * rolling_aggregation const& aggr, - * rmm::mr::device_memory_resource* mr) + * rmm::device_async_resource_ref mr) * * @param default_outputs A column of per-row default values to be returned instead * of nulls. 
Used for LEAD()/LAG(), if the row offset crosses @@ -282,7 +283,7 @@ std::unique_ptr grouped_rolling_window( size_type following_window, size_type min_periods, rolling_aggregation const& aggr, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief @copybrief grouped_rolling_window @@ -294,7 +295,7 @@ std::unique_ptr grouped_rolling_window( * size_type following_window, * size_type min_periods, * rolling_aggregation const& aggr, - * rmm::mr::device_memory_resource* mr) + * rmm::device_async_resource_ref mr) */ std::unique_ptr grouped_rolling_window( table_view const& group_keys, @@ -304,7 +305,7 @@ std::unique_ptr grouped_rolling_window( window_bounds following_window, size_type min_periods, rolling_aggregation const& aggr, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Applies a grouping-aware, timestamp-based rolling window function to the values in a @@ -399,7 +400,7 @@ std::unique_ptr grouped_time_range_rolling_window( size_type following_window_in_days, size_type min_periods, rolling_aggregation const& aggr, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Applies a grouping-aware, timestamp-based rolling window function to the values in a @@ -414,7 +415,7 @@ std::unique_ptr grouped_time_range_rolling_window( * size_type following_window_in_days, * size_type min_periods, * rolling_aggregation const& aggr, - * rmm::mr::device_memory_resource* mr) + * rmm::device_async_resource_ref mr) * * The `preceding_window_in_days` and `following_window_in_days` are specified as a `window_bounds` * and supports "unbounded" windows, if set to `window_bounds::unbounded()`. 
@@ -428,7 +429,7 @@ std::unique_ptr grouped_time_range_rolling_window( window_bounds following_window_in_days, size_type min_periods, rolling_aggregation const& aggr, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Applies a grouping-aware, value range-based rolling window function to the values in a @@ -548,7 +549,7 @@ std::unique_ptr grouped_range_rolling_window( range_window_bounds const& following, size_type min_periods, rolling_aggregation const& aggr, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Applies a variable-size rolling window function to the values in a column. @@ -591,7 +592,7 @@ std::unique_ptr rolling_window( column_view const& following_window, size_type min_periods, rolling_aggregation const& agg, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/round.hpp b/cpp/include/cudf/round.hpp index ee088628b94..85935f8f05c 100644 --- a/cpp/include/cudf/round.hpp +++ b/cpp/include/cudf/round.hpp @@ -19,6 +19,7 @@ #include #include +#include namespace cudf { @@ -72,9 +73,9 @@ enum class rounding_method : int32_t { HALF_UP, HALF_EVEN }; */ std::unique_ptr round( column_view const& input, - int32_t decimal_places = 0, - rounding_method method = rounding_method::HALF_UP, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + int32_t decimal_places = 0, + rounding_method method = rounding_method::HALF_UP, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/scalar/scalar.hpp 
b/cpp/include/cudf/scalar/scalar.hpp index 08bffab5067..da1d0d743a7 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ #include #include #include +#include /** * @file @@ -112,8 +113,8 @@ class scalar { * @param mr Device memory resource to use for device memory allocation. */ scalar(scalar const& other, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new scalar object. @@ -127,9 +128,9 @@ class scalar { * @param mr Device memory resource to use for device memory allocation. */ scalar(data_type type, - bool is_valid = false, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + bool is_valid = false, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); }; namespace detail { @@ -164,8 +165,8 @@ class fixed_width_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. */ fixed_width_scalar(fixed_width_scalar const& other, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Set the value of the scalar. 
@@ -214,9 +215,9 @@ class fixed_width_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. */ fixed_width_scalar(T value, - bool is_valid = true, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + bool is_valid = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new fixed width scalar object from existing device memory. @@ -227,9 +228,9 @@ class fixed_width_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. */ fixed_width_scalar(rmm::device_scalar&& data, - bool is_valid = true, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + bool is_valid = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); }; } // namespace detail @@ -264,8 +265,8 @@ class numeric_scalar : public detail::fixed_width_scalar { * @param mr Device memory resource to use for device memory allocation. */ numeric_scalar(numeric_scalar const& other, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new numeric scalar object. @@ -276,9 +277,9 @@ class numeric_scalar : public detail::fixed_width_scalar { * @param mr Device memory resource to use for device memory allocation. 
*/ numeric_scalar(T value, - bool is_valid = true, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + bool is_valid = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new numeric scalar object from existing device memory. @@ -289,9 +290,9 @@ class numeric_scalar : public detail::fixed_width_scalar { * @param mr Device memory resource to use for device memory allocation. */ numeric_scalar(rmm::device_scalar&& data, - bool is_valid = true, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + bool is_valid = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); }; /** @@ -327,8 +328,8 @@ class fixed_point_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. */ fixed_point_scalar(fixed_point_scalar const& other, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new fixed_point scalar object from already shifted value and scale. 
@@ -341,9 +342,9 @@ class fixed_point_scalar : public scalar { */ fixed_point_scalar(rep_type value, numeric::scale_type scale, - bool is_valid = true, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + bool is_valid = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new fixed_point scalar object from a value and default 0-scale. @@ -354,9 +355,9 @@ class fixed_point_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. */ fixed_point_scalar(rep_type value, - bool is_valid = true, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + bool is_valid = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new fixed_point scalar object from a fixed_point number. @@ -367,9 +368,9 @@ class fixed_point_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. */ fixed_point_scalar(T value, - bool is_valid = true, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + bool is_valid = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new fixed_point scalar object from existing device memory. 
@@ -382,9 +383,9 @@ class fixed_point_scalar : public scalar { */ fixed_point_scalar(rmm::device_scalar&& data, numeric::scale_type scale, - bool is_valid = true, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + bool is_valid = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Get the value of the scalar. @@ -451,8 +452,8 @@ class string_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. */ string_scalar(string_scalar const& other, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new string scalar object. @@ -465,9 +466,9 @@ class string_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. */ string_scalar(std::string const& string, - bool is_valid = true, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + bool is_valid = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new string scalar object from string_view. @@ -480,9 +481,9 @@ class string_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. 
*/ string_scalar(value_type const& source, - bool is_valid = true, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + bool is_valid = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new string scalar object from string_view in device memory. @@ -495,9 +496,9 @@ class string_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. */ string_scalar(rmm::device_scalar& data, - bool is_valid = true, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + bool is_valid = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new string scalar object by moving an existing string data buffer. @@ -511,9 +512,9 @@ class string_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. */ string_scalar(rmm::device_buffer&& data, - bool is_valid = true, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + bool is_valid = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Explicit conversion operator to get the value of the scalar in a host std::string. @@ -584,8 +585,8 @@ class chrono_scalar : public detail::fixed_width_scalar { * @param mr Device memory resource to use for device memory allocation. 
*/ chrono_scalar(chrono_scalar const& other, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new chrono scalar object. @@ -596,9 +597,9 @@ class chrono_scalar : public detail::fixed_width_scalar { * @param mr Device memory resource to use for device memory allocation. */ chrono_scalar(T value, - bool is_valid = true, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + bool is_valid = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new chrono scalar object from existing device memory. @@ -609,9 +610,9 @@ class chrono_scalar : public detail::fixed_width_scalar { * @param mr Device memory resource to use for device memory allocation. */ chrono_scalar(rmm::device_scalar&& data, - bool is_valid = true, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + bool is_valid = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); }; /** @@ -643,8 +644,8 @@ class timestamp_scalar : public chrono_scalar { * @param mr Device memory resource to use for device memory allocation. 
*/ timestamp_scalar(timestamp_scalar const& other, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new timestamp scalar object from a duration that is @@ -659,8 +660,8 @@ class timestamp_scalar : public chrono_scalar { template timestamp_scalar(Duration2 const& value, bool is_valid, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the duration in number of ticks since the UNIX epoch. @@ -699,8 +700,8 @@ class duration_scalar : public chrono_scalar { * @param mr Device memory resource to use for device memory allocation. */ duration_scalar(duration_scalar const& other, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new duration scalar object from tick counts. @@ -712,8 +713,8 @@ class duration_scalar : public chrono_scalar { */ duration_scalar(rep_type value, bool is_valid, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the duration in number of ticks. 
@@ -748,8 +749,8 @@ class list_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. */ list_scalar(list_scalar const& other, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new list scalar object from column_view. @@ -762,9 +763,9 @@ class list_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. */ list_scalar(cudf::column_view const& data, - bool is_valid = true, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + bool is_valid = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new list scalar object from existing column. @@ -775,9 +776,9 @@ class list_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. */ list_scalar(cudf::column&& data, - bool is_valid = true, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + bool is_valid = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a non-owning, immutable view to underlying device data. @@ -813,8 +814,8 @@ class struct_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. 
*/ struct_scalar(struct_scalar const& other, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new struct scalar object from table_view. @@ -827,9 +828,9 @@ class struct_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. */ struct_scalar(table_view const& data, - bool is_valid = true, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + bool is_valid = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new struct scalar object from a host_span of column_views. @@ -842,9 +843,9 @@ class struct_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. */ struct_scalar(host_span data, - bool is_valid = true, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + bool is_valid = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new struct scalar object from an existing table in device memory. @@ -858,9 +859,9 @@ class struct_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. 
*/ struct_scalar(table&& data, - bool is_valid = true, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + bool is_valid = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a non-owning, immutable view to underlying device data. @@ -888,7 +889,7 @@ class struct_scalar : public scalar { static table init_data(table&& data, bool is_valid, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); }; /** @} */ // end of group diff --git a/cpp/include/cudf/scalar/scalar_factories.hpp b/cpp/include/cudf/scalar/scalar_factories.hpp index 78b6c4fd0e9..7dd4674a2fd 100644 --- a/cpp/include/cudf/scalar/scalar_factories.hpp +++ b/cpp/include/cudf/scalar/scalar_factories.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include #include +#include namespace cudf { /** @@ -43,8 +44,8 @@ namespace cudf { */ std::unique_ptr make_numeric_scalar( data_type type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct scalar with uninitialized storage to hold a value of the @@ -60,8 +61,8 @@ std::unique_ptr make_numeric_scalar( */ std::unique_ptr make_timestamp_scalar( data_type type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct scalar with uninitialized storage to hold a value of the @@ -77,8 +78,8 @@ std::unique_ptr make_timestamp_scalar( */ std::unique_ptr make_duration_scalar( data_type type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct scalar with uninitialized storage to hold a value of the @@ -94,8 +95,8 @@ std::unique_ptr make_duration_scalar( */ std::unique_ptr make_fixed_width_scalar( data_type type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct STRING type scalar given a `std::string`. 
@@ -111,8 +112,8 @@ std::unique_ptr make_fixed_width_scalar( */ std::unique_ptr make_string_scalar( std::string const& string, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Constructs default constructed scalar of type `type` @@ -126,8 +127,8 @@ std::unique_ptr make_string_scalar( */ std::unique_ptr make_default_constructed_scalar( data_type type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Creates an empty (invalid) scalar of the same type as the `input` column_view. @@ -141,8 +142,8 @@ std::unique_ptr make_default_constructed_scalar( */ std::unique_ptr make_empty_scalar_like( column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct scalar using the given value of fixed width type @@ -156,8 +157,8 @@ std::unique_ptr make_empty_scalar_like( template std::unique_ptr make_fixed_width_scalar( T value, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) { return std::make_unique>(value, true, stream, mr); } @@ -176,8 +177,8 @@ template std::unique_ptr make_fixed_point_scalar( typename 
T::rep value, numeric::scale_type scale, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) { return std::make_unique>(value, scale, true, stream, mr); } @@ -192,8 +193,8 @@ std::unique_ptr make_fixed_point_scalar( */ std::unique_ptr make_list_scalar( column_view elements, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a struct scalar using the given table_view. @@ -207,8 +208,8 @@ std::unique_ptr make_list_scalar( */ std::unique_ptr make_struct_scalar( table_view const& data, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a struct scalar using the given span of column views. 
@@ -222,8 +223,8 @@ std::unique_ptr make_struct_scalar( */ std::unique_ptr make_struct_scalar( host_span data, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/search.hpp b/cpp/include/cudf/search.hpp index 49acce6a63b..2e50ba2d687 100644 --- a/cpp/include/cudf/search.hpp +++ b/cpp/include/cudf/search.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include +#include #include @@ -72,8 +73,8 @@ std::unique_ptr lower_bound( table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Find largest indices in a sorted table where values should be inserted to maintain order. @@ -114,8 +115,8 @@ std::unique_ptr upper_bound( table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Check if the given `needle` value exists in the `haystack` column. 
@@ -163,8 +164,8 @@ bool contains(column_view const& haystack, std::unique_ptr contains( column_view const& haystack, column_view const& needles, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index 42bcb5da8e3..79a00cbce42 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -54,7 +55,7 @@ std::unique_ptr sorted_order( std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Computes the row indices that would produce `input` in a stable @@ -69,7 +70,7 @@ std::unique_ptr stable_sorted_order( std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Checks whether the rows of a `table` are sorted in a lexicographical @@ -113,7 +114,7 @@ std::unique_ptr
sort( std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a stable lexicographic sort of the rows of a table @@ -125,7 +126,7 @@ std::unique_ptr
stable_sort( std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a key-value sort. @@ -155,7 +156,7 @@ std::unique_ptr
sort_by_key( std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a key-value stable sort. @@ -168,7 +169,7 @@ std::unique_ptr
stable_sort_by_key( std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Computes the ranks of input column in sorted order. @@ -207,8 +208,8 @@ std::unique_ptr rank( null_policy null_handling, null_order null_precedence, bool percentage, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns sorted order after sorting each segment in the table. @@ -259,7 +260,7 @@ std::unique_ptr segmented_sorted_order( std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns sorted order after stably sorting each segment in the table. @@ -272,7 +273,7 @@ std::unique_ptr stable_segmented_sorted_order( std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a lexicographic segmented sort of a table @@ -328,7 +329,7 @@ std::unique_ptr
segmented_sort_by_key( std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a stably lexicographic segmented sort of a table @@ -342,7 +343,7 @@ std::unique_ptr
stable_segmented_sort_by_key( std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 3e7bdf13707..c386b3a22b4 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -73,7 +74,7 @@ std::unique_ptr
drop_nulls( table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Filters a table to remove null elements. @@ -104,7 +105,7 @@ std::unique_ptr
drop_nulls( std::unique_ptr
drop_nulls( table_view const& input, std::vector const& keys, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Filters a table to remove NANs with threshold count. @@ -147,7 +148,7 @@ std::unique_ptr
drop_nans( table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Filters a table to remove NANs. @@ -179,7 +180,7 @@ std::unique_ptr
drop_nans( std::unique_ptr
drop_nans( table_view const& input, std::vector const& keys, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Filters `input` using `boolean_mask` of boolean values as a mask. @@ -205,7 +206,7 @@ std::unique_ptr
drop_nans( std::unique_ptr
apply_boolean_mask( table_view const& input, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Choices for drop_duplicates API for retainment of duplicate rows @@ -248,8 +249,8 @@ std::unique_ptr
unique( table_view const& input, std::vector const& keys, duplicate_keep_option keep, - null_equality nulls_equal = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + null_equality nulls_equal = null_equality::EQUAL, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create a new table without duplicate rows. @@ -273,10 +274,10 @@ std::unique_ptr
unique( std::unique_ptr
distinct( table_view const& input, std::vector const& keys, - duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, - null_equality nulls_equal = null_equality::EQUAL, - nan_equality nans_equal = nan_equality::ALL_EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create a column of indices of all distinct rows in the input table. @@ -294,11 +295,11 @@ std::unique_ptr
distinct( */ std::unique_ptr distinct_indices( table_view const& input, - duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, - null_equality nulls_equal = null_equality::EQUAL, - nan_equality nans_equal = nan_equality::ALL_EQUAL, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create a new table without duplicate rows, preserving input order. @@ -325,10 +326,10 @@ std::unique_ptr distinct_indices( std::unique_ptr
stable_distinct( table_view const& input, std::vector const& keys, - duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, - null_equality nulls_equal = null_equality::EQUAL, - nan_equality nans_equal = nan_equality::ALL_EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Count the number of consecutive groups of equivalent rows in a column. diff --git a/cpp/include/cudf/strings/attributes.hpp b/cpp/include/cudf/strings/attributes.hpp index 85086e44a26..26f906b3102 100644 --- a/cpp/include/cudf/strings/attributes.hpp +++ b/cpp/include/cudf/strings/attributes.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,6 +19,7 @@ #include #include +#include namespace cudf { @@ -47,7 +48,7 @@ namespace strings { */ std::unique_ptr count_characters( strings_column_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a column containing byte lengths @@ -65,7 +66,7 @@ std::unique_ptr count_characters( */ std::unique_ptr count_bytes( strings_column_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Creates a numeric column with code point values (integers) for each @@ -85,7 +86,7 @@ std::unique_ptr count_bytes( */ std::unique_ptr code_points( strings_column_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of strings_apis group diff --git a/cpp/include/cudf/strings/capitalize.hpp b/cpp/include/cudf/strings/capitalize.hpp index 57375e9ac6a..f8cbdc09748 100644 --- a/cpp/include/cudf/strings/capitalize.hpp +++ b/cpp/include/cudf/strings/capitalize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -60,9 +61,9 @@ namespace strings { */ std::unique_ptr capitalize( strings_column_view const& input, - string_scalar const& delimiters = string_scalar("", true, cudf::get_default_stream()), - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + string_scalar const& delimiters = string_scalar("", true, cudf::get_default_stream()), + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Modifies first character of each word to upper-case and lower-cases the rest. @@ -95,7 +96,7 @@ std::unique_ptr title( strings_column_view const& input, string_character_types sequence_type = string_character_types::ALPHA, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Checks if the strings in the input column are title formatted. @@ -123,8 +124,8 @@ std::unique_ptr title( */ std::unique_ptr is_title( strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/case.hpp b/cpp/include/cudf/strings/case.hpp index 94191686a92..5403fa8db7e 100644 --- a/cpp/include/cudf/strings/case.hpp +++ b/cpp/include/cudf/strings/case.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -44,8 +45,8 @@ namespace strings { */ std::unique_ptr to_lower( strings_column_view const& strings, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Converts a column of strings to upper case. @@ -63,8 +64,8 @@ std::unique_ptr to_lower( */ std::unique_ptr to_upper( strings_column_view const& strings, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a column of strings converting lower case characters to @@ -83,8 +84,8 @@ std::unique_ptr to_upper( */ std::unique_ptr swapcase( strings_column_view const& strings, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp index c6db5dab08a..da7a238a400 100644 --- a/cpp/include/cudf/strings/char_types/char_types.hpp +++ b/cpp/include/cudf/strings/char_types/char_types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -67,7 +68,7 @@ std::unique_ptr all_characters_of_type( string_character_types types, string_character_types verify_types = string_character_types::ALL_TYPES, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Filter specific character types from a column of strings. @@ -114,7 +115,7 @@ std::unique_ptr filter_characters_of_type( string_scalar const& replacement = string_scalar(""), string_character_types types_to_keep = string_character_types::ALL_TYPES, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp index 568e8ac50ec..8cc735831b8 100644 --- a/cpp/include/cudf/strings/combine.hpp +++ b/cpp/include/cudf/strings/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -77,10 +78,10 @@ enum class output_if_empty_list { */ std::unique_ptr join_strings( strings_column_view const& input, - string_scalar const& separator = string_scalar(""), - string_scalar const& narep = string_scalar("", false), - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + string_scalar const& separator = string_scalar(""), + string_scalar const& narep = string_scalar("", false), + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Concatenates a list of strings columns using separators for each row @@ -148,7 +149,7 @@ std::unique_ptr concatenate( string_scalar const& col_narep = string_scalar("", false), separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Row-wise concatenates the given list of strings columns and @@ -199,11 +200,11 @@ std::unique_ptr concatenate( */ std::unique_ptr concatenate( table_view const& strings_columns, - string_scalar const& separator = string_scalar(""), - string_scalar const& narep = string_scalar("", false), - separator_on_nulls separate_nulls = separator_on_nulls::YES, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + string_scalar const& separator = string_scalar(""), + string_scalar const& narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); 
/** * @brief Given a lists column of strings (each row is a list of strings), concatenates the strings @@ -270,7 +271,7 @@ std::unique_ptr join_list_elements( separator_on_nulls separate_nulls = separator_on_nulls::YES, output_if_empty_list empty_list_policy = output_if_empty_list::EMPTY_STRING, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Given a lists column of strings (each row is a list of strings), concatenates the strings @@ -329,7 +330,7 @@ std::unique_ptr join_list_elements( separator_on_nulls separate_nulls = separator_on_nulls::YES, output_if_empty_list empty_list_policy = output_if_empty_list::EMPTY_STRING, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/contains.hpp b/cpp/include/cudf/strings/contains.hpp index 341c146df92..f79a0f19e9c 100644 --- a/cpp/include/cudf/strings/contains.hpp +++ b/cpp/include/cudf/strings/contains.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -59,8 +60,8 @@ struct regex_program; std::unique_ptr contains_re( strings_column_view const& input, regex_program const& prog, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a boolean column identifying rows which @@ -87,8 +88,8 @@ std::unique_ptr contains_re( std::unique_ptr matches_re( strings_column_view const& input, regex_program const& prog, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the number of times the given regex_program's pattern @@ -115,8 +116,8 @@ std::unique_ptr matches_re( std::unique_ptr count_re( strings_column_view const& input, regex_program const& prog, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a boolean column identifying rows which @@ -163,7 +164,7 @@ std::unique_ptr like( string_scalar const& pattern, string_scalar const& escape_character = string_scalar(""), rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a boolean column identifying rows which @@ -204,7 +205,7 @@ std::unique_ptr like( strings_column_view 
const& patterns, string_scalar const& escape_character = string_scalar(""), rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/convert/convert_booleans.hpp b/cpp/include/cudf/strings/convert/convert_booleans.hpp index 9e9f25e800a..9c922361914 100644 --- a/cpp/include/cudf/strings/convert/convert_booleans.hpp +++ b/cpp/include/cudf/strings/convert/convert_booleans.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -44,8 +45,8 @@ namespace strings { std::unique_ptr to_booleans( strings_column_view const& input, string_scalar const& true_string, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a new strings column converting the boolean values from the @@ -66,8 +67,8 @@ std::unique_ptr from_booleans( column_view const& booleans, string_scalar const& true_string, string_scalar const& false_string, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git 
a/cpp/include/cudf/strings/convert/convert_datetime.hpp b/cpp/include/cudf/strings/convert/convert_datetime.hpp index 81cce14b53b..b89384d718b 100644 --- a/cpp/include/cudf/strings/convert/convert_datetime.hpp +++ b/cpp/include/cudf/strings/convert/convert_datetime.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -88,8 +89,8 @@ std::unique_ptr to_timestamps( strings_column_view const& input, data_type timestamp_type, std::string_view format, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Verifies the given strings column can be parsed to timestamps using the provided format @@ -135,8 +136,8 @@ std::unique_ptr to_timestamps( std::unique_ptr is_timestamp( strings_column_view const& input, std::string_view format, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a new strings column converting a timestamp column into @@ -246,11 +247,11 @@ std::unique_ptr is_timestamp( */ std::unique_ptr from_timestamps( column_view const& timestamps, - std::string_view format = "%Y-%m-%dT%H:%M:%SZ", - strings_column_view const& names = strings_column_view(column_view{ + std::string_view format = "%Y-%m-%dT%H:%M:%SZ", + strings_column_view const& names = strings_column_view(column_view{ data_type{type_id::STRING}, 0, 
nullptr, nullptr, 0}), - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/convert/convert_durations.hpp b/cpp/include/cudf/strings/convert/convert_durations.hpp index a1f4e4ead1d..2db719a4f1f 100644 --- a/cpp/include/cudf/strings/convert/convert_durations.hpp +++ b/cpp/include/cudf/strings/convert/convert_durations.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -76,8 +77,8 @@ std::unique_ptr to_durations( strings_column_view const& input, data_type duration_type, std::string_view format, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a new strings column converting a duration column into @@ -126,9 +127,9 @@ std::unique_ptr to_durations( */ std::unique_ptr from_durations( column_view const& durations, - std::string_view format = "%D days %H:%M:%S", - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + std::string_view format = "%D days %H:%M:%S", + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group 
} // namespace strings diff --git a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp index 8f37715967a..9911bea1948 100644 --- a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp +++ b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -62,8 +63,8 @@ namespace strings { std::unique_ptr to_fixed_point( strings_column_view const& input, data_type output_type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a new strings column converting the fixed-point values @@ -92,8 +93,8 @@ std::unique_ptr to_fixed_point( */ std::unique_ptr from_fixed_point( column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a boolean column identifying strings in which all @@ -123,9 +124,9 @@ std::unique_ptr from_fixed_point( */ std::unique_ptr is_fixed_point( strings_column_view const& input, - data_type decimal_type = data_type{type_id::DECIMAL64}, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + data_type decimal_type = data_type{type_id::DECIMAL64}, + 
rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/convert/convert_floats.hpp b/cpp/include/cudf/strings/convert/convert_floats.hpp index a35cb68ef4e..feb5b528686 100644 --- a/cpp/include/cudf/strings/convert/convert_floats.hpp +++ b/cpp/include/cudf/strings/convert/convert_floats.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -48,8 +49,8 @@ namespace strings { std::unique_ptr to_floats( strings_column_view const& strings, data_type output_type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a new strings column converting the float values from the @@ -71,8 +72,8 @@ std::unique_ptr to_floats( */ std::unique_ptr from_floats( column_view const& floats, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a boolean column identifying strings in which all @@ -97,8 +98,8 @@ std::unique_ptr from_floats( */ std::unique_ptr is_float( strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + 
rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp index 74ec5d315a2..82696811fdc 100644 --- a/cpp/include/cudf/strings/convert/convert_integers.hpp +++ b/cpp/include/cudf/strings/convert/convert_integers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -55,8 +56,8 @@ namespace strings { std::unique_ptr to_integers( strings_column_view const& input, data_type output_type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a new strings column converting the integer values from the @@ -76,8 +77,8 @@ std::unique_ptr to_integers( */ std::unique_ptr from_integers( column_view const& integers, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a boolean column identifying strings in which all @@ -105,8 +106,8 @@ std::unique_ptr from_integers( */ std::unique_ptr is_integer( strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a boolean column identifying strings in which all @@ -139,8 +140,8 @@ std::unique_ptr is_integer( std::unique_ptr is_integer( strings_column_view const& input, data_type int_type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a new integer numeric column parsing hexadecimal values from the @@ -169,8 +170,8 @@ std::unique_ptr is_integer( std::unique_ptr hex_to_integers( strings_column_view const& input, data_type output_type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a boolean column identifying strings in which all @@ -196,8 +197,8 @@ std::unique_ptr hex_to_integers( */ std::unique_ptr is_hex( strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a new strings column converting integer columns to hexadecimal @@ -229,8 +230,8 @@ std::unique_ptr is_hex( */ std::unique_ptr integers_to_hex( column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream 
= cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/convert/convert_ipv4.hpp b/cpp/include/cudf/strings/convert/convert_ipv4.hpp index 25ad7b86748..64f8a412ce9 100644 --- a/cpp/include/cudf/strings/convert/convert_ipv4.hpp +++ b/cpp/include/cudf/strings/convert/convert_ipv4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -55,8 +56,8 @@ namespace strings { */ std::unique_ptr ipv4_to_integers( strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Converts integers into IPv4 addresses as strings. 
@@ -80,8 +81,8 @@ std::unique_ptr ipv4_to_integers( */ std::unique_ptr integers_to_ipv4( column_view const& integers, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a boolean column identifying strings in which all @@ -107,8 +108,8 @@ std::unique_ptr integers_to_ipv4( */ std::unique_ptr is_ipv4( strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/convert/convert_lists.hpp b/cpp/include/cudf/strings/convert/convert_lists.hpp index dedf4e95138..a88bbe99492 100644 --- a/cpp/include/cudf/strings/convert/convert_lists.hpp +++ b/cpp/include/cudf/strings/convert/convert_lists.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -63,7 +64,7 @@ std::unique_ptr format_list_column( strings_column_view const& separators = strings_column_view(column_view{ data_type{type_id::STRING}, 0, nullptr, nullptr, 0}), rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/convert/convert_urls.hpp b/cpp/include/cudf/strings/convert/convert_urls.hpp index 902835081af..30988d2ff0a 100644 --- a/cpp/include/cudf/strings/convert/convert_urls.hpp +++ b/cpp/include/cudf/strings/convert/convert_urls.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -46,8 +47,8 @@ namespace strings { */ std::unique_ptr url_encode( strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Encodes each string using URL encoding. 
@@ -69,8 +70,8 @@ std::unique_ptr url_encode( */ std::unique_ptr url_decode( strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/detail/combine.hpp b/cpp/include/cudf/strings/detail/combine.hpp index 3b8ed0f4e0d..25214055787 100644 --- a/cpp/include/cudf/strings/detail/combine.hpp +++ b/cpp/include/cudf/strings/detail/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -30,7 +31,7 @@ namespace detail { /** * @copydoc concatenate(table_view const&,string_scalar const&,string_scalar - * const&,rmm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -39,11 +40,11 @@ std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& narep, separator_on_nulls separate_nulls, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc join_strings(table_view const&,string_scalar const&,string_scalar - * const&,rmm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ @@ -51,11 +52,11 @@ std::unique_ptr join_strings(strings_column_view const& strings, string_scalar const& separator, string_scalar const& narep, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc join_list_elements(table_view const&,string_scalar const&,string_scalar - * const&,separator_on_nulls,output_if_empty_list,rmm::mr::device_memory_resource*) + * const&,separator_on_nulls,output_if_empty_list,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -65,7 +66,7 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string separator_on_nulls separate_nulls, output_if_empty_list empty_list_policy, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/concatenate.hpp b/cpp/include/cudf/strings/detail/concatenate.hpp index 511e240886a..b5dd5b9516a 100644 --- a/cpp/include/cudf/strings/detail/concatenate.hpp +++ b/cpp/include/cudf/strings/detail/concatenate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -44,7 +45,7 @@ namespace detail { */ std::unique_ptr concatenate(host_span columns, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/convert/string_to_float.cuh b/cpp/include/cudf/strings/detail/convert/string_to_float.cuh index ab934750f9e..bbf56cf1446 100644 --- a/cpp/include/cudf/strings/detail/convert/string_to_float.cuh +++ b/cpp/include/cudf/strings/detail/convert/string_to_float.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -102,6 +102,9 @@ __device__ inline double stod(string_view const& d_str) ch = *in_ptr++; if (ch < '0' || ch > '9') break; exp_ten = (exp_ten * 10) + (int)(ch - '0'); + // Prevent integer overflow in exp_ten. 100,000,000 is the largest + // power of ten that can be multiplied by 10 without overflow. + if (exp_ten >= 100'000'000) { break; } } } } diff --git a/cpp/include/cudf/strings/detail/converters.hpp b/cpp/include/cudf/strings/detail/converters.hpp index 3337815342c..d212239264b 100644 --- a/cpp/include/cudf/strings/detail/converters.hpp +++ b/cpp/include/cudf/strings/detail/converters.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,63 +20,64 @@ #include #include +#include namespace cudf { namespace strings { namespace detail { /** - * @copydoc to_integers(strings_column_view const&,data_type,rmm::mr::device_memory_resource*) + * @copydoc to_integers(strings_column_view const&,data_type,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr to_integers(strings_column_view const& strings, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** - * @copydoc from_integers(strings_column_view const&,rmm::mr::device_memory_resource*) + * @copydoc from_integers(strings_column_view const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr from_integers(column_view const& integers, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** - * @copydoc to_floats(strings_column_view const&,data_type,rmm::mr::device_memory_resource*) + * @copydoc to_floats(strings_column_view const&,data_type,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr to_floats(strings_column_view const& strings, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** - * @copydoc from_floats(strings_column_view const&,rmm::mr::device_memory_resource*) + * @copydoc from_floats(strings_column_view const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr from_floats(column_view const& floats, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc to_booleans(strings_column_view const&,string_scalar - * const&,rmm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr to_booleans(strings_column_view const& strings, string_scalar const& true_string, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc from_booleans(strings_column_view const&,string_scalar const&,string_scalar - * const&,rmm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -84,11 +85,11 @@ std::unique_ptr from_booleans(column_view const& booleans, string_scalar const& true_string, string_scalar const& false_string, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc to_timestamps(strings_column_view const&,data_type,std::string_view, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -96,11 +97,11 @@ std::unique_ptr to_timestamps(strings_column_view const& strings, data_type timestamp_type, std::string_view format, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc from_timestamps(strings_column_view const&,std::string_view, - * strings_column_view const&,rmm::mr::device_memory_resource*) + * strings_column_view const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ @@ -108,11 +109,11 @@ std::unique_ptr from_timestamps(column_view const& timestamps, std::string_view format, strings_column_view const& names, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc to_durations(strings_column_view const&,data_type,std::string_view, - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -120,37 +121,37 @@ std::unique_ptr to_durations(strings_column_view const& strings, data_type duration_type, std::string_view format, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc from_durations(strings_column_view const&,std::string_view. - * rmm::mr::device_memory_resource*) + * rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr from_durations(column_view const& durations, std::string_view format, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** - * @copydoc to_fixed_point(strings_column_view const&,data_type,rmm::mr::device_memory_resource*) + * @copydoc to_fixed_point(strings_column_view const&,data_type,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr to_fixed_point(strings_column_view const& strings, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** - * @copydoc from_fixed_point(strings_column_view const&,rmm::mr::device_memory_resource*) + * @copydoc from_fixed_point(strings_column_view const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr from_fixed_point(column_view const& integers, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh index 08ba99e90d8..4db7651330b 100644 --- a/cpp/include/cudf/strings/detail/copy_if_else.cuh +++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -59,7 +60,7 @@ std::unique_ptr copy_if_else(StringIterLeft lhs_begin, StringIterRight rhs_begin, Filter filter_fn, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto strings_count = std::distance(lhs_begin, lhs_end); if (strings_count == 0) { return make_empty_column(type_id::STRING); } diff --git a/cpp/include/cudf/strings/detail/copy_range.hpp b/cpp/include/cudf/strings/detail/copy_range.hpp index e18f1fdc5ad..192c5b833c6 100644 --- a/cpp/include/cudf/strings/detail/copy_range.hpp +++ b/cpp/include/cudf/strings/detail/copy_range.hpp @@ -19,6 +19,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -53,7 +54,7 @@ std::unique_ptr copy_range(strings_column_view const& source, size_type source_end, size_type target_begin, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/copying.hpp b/cpp/include/cudf/strings/detail/copying.hpp index 7e82ad4c679..240cac17188 100644 --- a/cpp/include/cudf/strings/detail/copying.hpp +++ b/cpp/include/cudf/strings/detail/copying.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -53,7 +54,7 @@ std::unique_ptr copy_slice(strings_column_view const& strings, size_type start, size_type end, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Returns a new strings column created by shifting the rows by a specified offset. @@ -80,7 +81,7 @@ std::unique_ptr shift(strings_column_view const& input, size_type offset, scalar const& fill_value, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/fill.hpp b/cpp/include/cudf/strings/detail/fill.hpp index 43e3f6198f3..c5d005fbf75 100644 --- a/cpp/include/cudf/strings/detail/fill.hpp +++ b/cpp/include/cudf/strings/detail/fill.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -47,7 +48,7 @@ std::unique_ptr fill(strings_column_view const& strings, size_type end, string_scalar const& value, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 06d959acffb..fcd74bebfe8 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -19,22 +19,19 @@ #include #include #include -#include #include +#include #include #include -#include #include #include +#include #include -#include #include #include #include -#include -#include #include namespace cudf { @@ -225,9 +222,9 @@ rmm::device_uvector gather_chars(StringIterator strings_begin, MapIterator map_begin, MapIterator map_end, cudf::detail::input_offsetalator const offsets, - size_type chars_bytes, + int64_t chars_bytes, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const output_count = std::distance(map_begin, map_end); if (output_count == 0) return rmm::device_uvector(0, stream, mr); @@ -238,9 +235,9 @@ rmm::device_uvector gather_chars(StringIterator strings_begin, constexpr int warps_per_threadblock = 4; // String parallel strategy will be used if average string length is above this threshold. // Otherwise, char parallel strategy will be used. 
- constexpr size_type string_parallel_threshold = 32; + constexpr int64_t string_parallel_threshold = 32; - size_type average_string_length = chars_bytes / output_count; + int64_t const average_string_length = chars_bytes / output_count; if (average_string_length > string_parallel_threshold) { constexpr int max_threadblocks = 65536; @@ -290,7 +287,7 @@ std::unique_ptr gather(strings_column_view const& strings, MapIterator begin, MapIterator end, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const output_count = std::distance(begin, end); if (output_count == 0) return make_empty_column(type_id::STRING); @@ -301,7 +298,7 @@ std::unique_ptr gather(strings_column_view const& strings, strings.is_empty() ? make_empty_column(type_id::INT32)->view() : strings.offsets(), strings.offset()); - auto offsets_itr = thrust::make_transform_iterator( + auto sizes_itr = thrust::make_transform_iterator( begin, cuda::proclaim_return_type( [d_strings = *d_strings, d_in_offsets] __device__(size_type idx) { @@ -309,8 +306,8 @@ std::unique_ptr gather(strings_column_view const& strings, if (not d_strings.is_valid(idx)) { return 0; } return static_cast(d_in_offsets[idx + 1] - d_in_offsets[idx]); })); - auto [out_offsets_column, total_bytes] = - cudf::detail::make_offsets_child_column(offsets_itr, offsets_itr + output_count, stream, mr); + auto [out_offsets_column, total_bytes] = cudf::strings::detail::make_offsets_child_column( + sizes_itr, sizes_itr + output_count, stream, mr); // build chars column auto const offsets_view = @@ -354,7 +351,7 @@ std::unique_ptr gather(strings_column_view const& strings, MapIterator end, bool nullify_out_of_bounds, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (nullify_out_of_bounds) return gather(strings, begin, end, stream, mr); return gather(strings, begin, end, stream, mr); diff --git 
a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh deleted file mode 100644 index f05e957783f..00000000000 --- a/cpp/include/cudf/strings/detail/merge.cuh +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include - -namespace cudf { -namespace strings { -namespace detail { -/** - * @brief Merges two strings columns. - * - * Caller must set the validity mask in the output column. - * - * @tparam row_order_iterator This must be an iterator for type thrust::tuple. - * - * @param lhs First column. - * @param rhs Second column. - * @param row_order Indexes for each column. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. 
- */ -template -std::unique_ptr merge(strings_column_view const& lhs, - strings_column_view const& rhs, - row_order_iterator begin, - row_order_iterator end, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - using cudf::detail::side; - size_type strings_count = static_cast(std::distance(begin, end)); - if (strings_count == 0) return make_empty_column(type_id::STRING); - - auto lhs_column = column_device_view::create(lhs.parent(), stream); - auto d_lhs = *lhs_column; - auto rhs_column = column_device_view::create(rhs.parent(), stream); - auto d_rhs = *rhs_column; - - // caller will set the null mask - rmm::device_buffer null_mask{0, stream, mr}; - size_type null_count = lhs.null_count() + rhs.null_count(); - if (null_count > 0) - null_mask = cudf::detail::create_null_mask(strings_count, mask_state::ALL_VALID, stream, mr); - - // build offsets column - auto offsets_transformer = - cuda::proclaim_return_type([d_lhs, d_rhs] __device__(auto index_pair) { - auto const [side, index] = index_pair; - if (side == side::LEFT ? d_lhs.is_null(index) : d_rhs.is_null(index)) return 0; - auto d_str = - side == side::LEFT ? d_lhs.element(index) : d_rhs.element(index); - return d_str.size_bytes(); - }); - auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer); - auto [offsets_column, bytes] = cudf::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto d_offsets = offsets_column->view().template data(); - - // create the chars column - rmm::device_uvector chars(bytes, stream, mr); - auto d_chars = chars.data(); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - [d_lhs, d_rhs, begin, d_offsets, d_chars] __device__(size_type idx) { - auto const [side, index] = begin[idx]; - if (side == side::LEFT ? d_lhs.is_null(index) : d_rhs.is_null(index)) return; - auto d_str = side == side::LEFT ? 
d_lhs.element(index) - : d_rhs.element(index); - memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes()); - }); - - return make_strings_column( - strings_count, std::move(offsets_column), chars.release(), null_count, std::move(null_mask)); -} - -} // namespace detail -} // namespace strings -} // namespace cudf diff --git a/cpp/include/cudf/strings/detail/merge.hpp b/cpp/include/cudf/strings/detail/merge.hpp new file mode 100644 index 00000000000..35fd9c0593d --- /dev/null +++ b/cpp/include/cudf/strings/detail/merge.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include + +#include + +namespace cudf ::strings ::detail { +/** + * @brief Merges two strings columns + * + * @param lhs First column + * @param rhs Second column + * @param row_order Indices for each column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ +std::unique_ptr merge(strings_column_view const& lhs, + strings_column_view const& rhs, + cudf::detail::index_vector const& row_order, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +} // namespace cudf::strings::detail diff --git a/cpp/include/cudf/strings/detail/replace.hpp b/cpp/include/cudf/strings/detail/replace.hpp index 28027291b28..aad89beb47e 100644 --- a/cpp/include/cudf/strings/detail/replace.hpp +++ b/cpp/include/cudf/strings/detail/replace.hpp @@ -21,6 +21,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -28,24 +29,24 @@ namespace detail { /** * @copydoc cudf::strings::replace(strings_column_view const&, string_scalar const&, - * string_scalar const&, int32_t, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) + * string_scalar const&, int32_t, rmm::cuda_stream_view, rmm::device_async_resource_ref) */ std::unique_ptr replace(strings_column_view const& strings, string_scalar const& target, string_scalar const& repl, int32_t maxrepl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::strings::replace(strings_column_view const&, strings_column_view const&, - * strings_column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) + * strings_column_view const&, rmm::cuda_stream_view, rmm::device_async_resource_ref) */ std::unique_ptr replace(strings_column_view const& strings, strings_column_view const& targets, strings_column_view const& repls, 
rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Replaces any null string entries with the given string. @@ -68,18 +69,36 @@ std::unique_ptr replace(strings_column_view const& strings, std::unique_ptr replace_nulls(strings_column_view const& strings, string_scalar const& repl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc cudf::strings::replace_slice(strings_column_view const&, string_scalar const&, - * size_type, size_type, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) + * size_type, size_type, rmm::cuda_stream_view, rmm::device_async_resource_ref) */ std::unique_ptr replace_slice(strings_column_view const& strings, string_scalar const& repl, size_type start, size_type stop, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); + +/** + * @brief Return a copy of `input` replacing any `values_to_replace[i]` + * found with `replacement_values[i]` + * + * @param input The column to find and replace values + * @param values_to_replace The values to find + * @param replacement_values The corresponding replacement values + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Copy of `input` with specified values replaced + */ +std::unique_ptr find_and_replace_all( + cudf::strings_column_view const& input, + cudf::strings_column_view const& values_to_replace, + cudf::strings_column_view const& replacement_values, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/scan.hpp b/cpp/include/cudf/strings/detail/scan.hpp index 611e32e28cd..f32afa64a72 100644 --- a/cpp/include/cudf/strings/detail/scan.hpp +++ 
b/cpp/include/cudf/strings/detail/scan.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -40,7 +41,7 @@ template std::unique_ptr scan_inclusive(column_view const& input, bitmask_type const* mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh index 8b8c11dcd5c..87f0e7ae47c 100644 --- a/cpp/include/cudf/strings/detail/scatter.cuh +++ b/cpp/include/cudf/strings/detail/scatter.cuh @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -63,7 +64,7 @@ std::unique_ptr scatter(SourceIterator begin, MapIterator scatter_map, strings_column_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (target.is_empty()) return make_empty_column(type_id::STRING); diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index 49c4be88ca5..f105a6dc546 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -17,12 +17,15 @@ #include #include +#include #include +#include #include #include #include #include +#include #include #include @@ -33,94 +36,6 @@ namespace cudf { namespace strings { namespace detail { -/** - * @brief Creates child offsets and chars data by applying the template function that - * can be used for computing the output size of each string as well as create the output - * - * @throws std::overflow_error if the output strings column exceeds the column 
size limit - * - * @tparam SizeAndExecuteFunction Function must accept an index and return a size. - * It must also have members d_offsets and d_chars which are set to - * memory containing the offsets and chars columns during write. - * - * @param size_and_exec_fn This is called twice. Once for the output size of each string - * and once again to fill in the memory pointed to by d_chars. - * @param exec_size Number of rows for executing the `size_and_exec_fn` function. - * @param strings_count Number of strings. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned columns' device memory. - * @return Offsets child column and chars data for a strings column - */ -template -auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, - size_type exec_size, - size_type strings_count, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto offsets_column = make_numeric_column( - data_type{type_to_id()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); - auto offsets_view = offsets_column->mutable_view(); - auto d_offsets = offsets_view.template data(); - size_and_exec_fn.d_offsets = d_offsets; - - // This is called twice -- once for offsets and once for chars. - // Reducing the number of places size_and_exec_fn is inlined speeds up compile time. 
- auto for_each_fn = [exec_size, stream](SizeAndExecuteFunction& size_and_exec_fn) { - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - exec_size, - size_and_exec_fn); - }; - - // Compute the output sizes - for_each_fn(size_and_exec_fn); - - // Convert the sizes to offsets - auto const bytes = - cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream); - CUDF_EXPECTS(bytes <= std::numeric_limits::max(), - "Size of output exceeds the column size limit", - std::overflow_error); - - // Now build the chars column - rmm::device_uvector chars(bytes, stream, mr); - - // Execute the function fn again to fill the chars column. - // Note that if the output chars column has zero size, the function fn should not be called to - // avoid accidentally overwriting the offsets. - if (bytes > 0) { - size_and_exec_fn.d_chars = chars.data(); - for_each_fn(size_and_exec_fn); - } - - return std::pair(std::move(offsets_column), std::move(chars)); -} - -/** - * @brief Creates child offsets and chars columns by applying the template function that - * can be used for computing the output size of each string as well as create the output. - * - * @tparam SizeAndExecuteFunction Function must accept an index and return a size. - * It must also have members d_offsets and d_chars which are set to - * memory containing the offsets and chars columns during write. - * - * @param size_and_exec_fn This is called twice. Once for the output size of each string - * and once again to fill in the memory pointed to by d_chars. - * @param strings_count Number of strings. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned columns' device memory. 
- * @return offsets child column and chars child column for a strings column - */ -template -auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, - size_type strings_count, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return make_strings_children(size_and_exec_fn, strings_count, strings_count, stream, mr); -} - /** * @brief Create an offsets column to be a child of a compound column * @@ -142,7 +57,7 @@ std::pair, int64_t> make_offsets_child_column( InputIterator begin, InputIterator end, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto constexpr size_type_max = static_cast(std::numeric_limits::max()); auto const lcount = static_cast(std::distance(begin, end)); @@ -163,22 +78,170 @@ std::pair, int64_t> make_offsets_child_column( }); auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn); // Use the sizes-to-offsets iterator to compute the total number of elements - auto const total_elements = + auto const total_bytes = cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream); - // TODO: replace exception with if-statement when enabling creating INT64 offsets - CUDF_EXPECTS(total_elements <= size_type_max, - "Size of output exceeds the character size limit", + auto const threshold = get_offset64_threshold(); + CUDF_EXPECTS(is_large_strings_enabled() || (total_bytes < threshold), + "Size of output exceeds the column size limit", std::overflow_error); - // if (total_elements >= get_offset64_threshold()) { - // // recompute as int64 offsets when above the threshold - // offsets_column = make_numeric_column( - // data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); - // auto d_offsets64 = offsets_column->mutable_view().template data(); - // sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream); - // } - - return std::pair(std::move(offsets_column), 
total_elements); + if (total_bytes >= get_offset64_threshold()) { + // recompute as int64 offsets when above the threshold + offsets_column = make_numeric_column( + data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + auto d_offsets64 = offsets_column->mutable_view().template data(); + cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream); + } + + return std::pair(std::move(offsets_column), total_bytes); +} + +/** + * @brief Kernel used by make_strings_children for calling the given functor + * + * @tparam SizeAndExecuteFunction Functor type to call in each thread + * + * @param fn Functor to call in each thread + * @param exec_size Total number of threads to be processed by this kernel + */ +template +CUDF_KERNEL void strings_children_kernel(SizeAndExecuteFunction fn, size_type exec_size) +{ + auto tid = cudf::detail::grid_1d::global_thread_id(); + if (tid < exec_size) { fn(tid); } +} + +/** + * @brief Creates child offsets and chars data by applying the template function that + * can be used for computing the output size of each string as well as create the output + * + * The `size_and_exec_fn` is expected declare an operator() function with a size_type parameter + * and 3 member variables: + * - `d_sizes`: output size in bytes of each output row for the 1st pass call + * - `d_chars`: output buffer for new string data for the 2nd pass call + * - `d_offsets`: used for addressing the specific output row data in `d_chars` + * + * The 1st pass call computes the output sizes and is identified by `d_chars==nullptr`. + * Null rows should be set with an output size of 0. 
+ * + * @code{.cpp} + * struct size_and_exec_fn { + * size_type* d_sizes; + * char* d_chars; + * input_offsetalator d_offsets; + * + * __device__ void operator()(size_type thread_idx) + * { + * // functor-specific logic to resolve out_idx from thread_idx + * if( !d_chars ) { + * d_sizes[out_idx] = output_size; + * } else { + * auto d_output = d_chars + d_offsets[out_idx]; + * // write characters to d_output + * } + * } + * }; + * @endcode + * + * @tparam SizeAndExecuteFunction Functor type with an operator() function accepting + * an index parameter and three member variables: `size_type* d_sizes` + * `char* d_chars`, and `input_offsetalator d_offsets`. + * + * @param size_and_exec_fn This is called twice. Once for the output size of each string + * and once again to fill in the memory pointed to by d_chars. + * @param exec_size Number of threads for executing the `size_and_exec_fn` function + * @param strings_count Number of strings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned columns' device memory + * @return Offsets child column and chars vector for creating a strings column + */ +template +auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, + size_type exec_size, + size_type strings_count, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + // This is called twice -- once for computing sizes and once for writing chars. + // Reducing the number of places size_and_exec_fn is inlined speeds up compile time. 
+ auto for_each_fn = [exec_size, stream](SizeAndExecuteFunction& size_and_exec_fn) { + auto constexpr block_size = 256; + auto grid = cudf::detail::grid_1d{exec_size, block_size}; + strings_children_kernel<<>>(size_and_exec_fn, + exec_size); + }; + + // Compute the output sizes + auto output_sizes = rmm::device_uvector(strings_count, stream); + size_and_exec_fn.d_sizes = output_sizes.data(); + size_and_exec_fn.d_chars = nullptr; + for_each_fn(size_and_exec_fn); + + // Convert the sizes to offsets + auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column( + output_sizes.begin(), output_sizes.end(), stream, mr); + size_and_exec_fn.d_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); + + // Now build the chars column + rmm::device_uvector chars(bytes, stream, mr); + size_and_exec_fn.d_chars = chars.data(); + + // Execute the function fn again to fill in the chars data. + if (bytes > 0) { for_each_fn(size_and_exec_fn); } + + return std::pair(std::move(offsets_column), std::move(chars)); +} + +/** + * @brief Creates child offsets and chars columns by applying the template function that + * can be used for computing the output size of each string as well as create the output + * + * The `size_and_exec_fn` is expected declare an operator() function with a size_type parameter + * and 3 member variables: + * - `d_sizes`: output size in bytes of each output row for the 1st pass call + * - `d_chars`: output buffer for new string data for the 2nd pass call + * - `d_offsets`: used for addressing the specific output row data in `d_chars` + * + * The 1st pass call computes the output sizes and is identified by `d_chars==nullptr`. + * Null rows should be set with an output size of 0. 
+ * + * @code{.cpp} + * struct size_and_exec_fn { + * size_type* d_sizes; + * char* d_chars; + * input_offsetalator d_offsets; + * + * __device__ void operator()(size_type idx) + * { + * if( !d_chars ) { + * d_sizes[idx] = output_size; + * } else { + * auto d_output = d_chars + d_offsets[idx]; + * // write characters to d_output + * } + * } + * }; + * @endcode + * + * @tparam SizeAndExecuteFunction Functor type with an operator() function accepting + * an index parameter and three member variables: `size_type* d_sizes` + * `char* d_chars`, and `input_offsetalator d_offsets`. + * + * @param size_and_exec_fn This is called twice. Once for the output size of each string + * and once again to fill in the memory pointed to by `d_chars`. + * @param strings_count Number of strings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned columns' device memory + * @return Offsets child column and chars vector for creating a strings column + */ +template +auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, + size_type strings_count, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + return make_strings_children(size_and_exec_fn, strings_count, strings_count, stream, mr); } } // namespace detail diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index 8e19f08a5cc..a3221038eed 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -25,6 +25,7 @@ #include #include +#include #include #include @@ -73,7 +74,7 @@ template std::unique_ptr make_strings_column(IndexPairIterator begin, IndexPairIterator end, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); size_type strings_count = thrust::distance(begin, 
end); @@ -85,9 +86,10 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, return (item.first != nullptr ? static_cast(item.second) : size_type{0}); }); auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer); - auto [offsets_column, bytes] = cudf::detail::make_offsets_child_column( + auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column( offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto offsets_view = offsets_column->view(); + auto const d_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); // create null mask auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; @@ -97,11 +99,10 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, (null_count > 0) ? std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr}; // build chars column - auto chars_data = [offsets_view, bytes = bytes, begin, strings_count, null_count, stream, mr] { + auto chars_data = [d_offsets, bytes = bytes, begin, strings_count, null_count, stream, mr] { auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1); // use a character-parallel kernel for long string lengths if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) { - auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets_view); auto const str_begin = thrust::make_transform_iterator( begin, cuda::proclaim_return_type([] __device__(auto ip) { return string_view{ip.first, ip.second}; @@ -120,12 +121,11 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, auto d_chars = chars_data.data(); auto copy_chars = [d_chars] __device__(auto item) { string_index_pair const str = thrust::get<0>(item); - size_type const offset = thrust::get<1>(item); + int64_t const offset = thrust::get<1>(item); if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); }; 
thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_zip_iterator( - thrust::make_tuple(begin, offsets_view.template begin())), + thrust::make_zip_iterator(thrust::make_tuple(begin, d_offsets)), strings_count, copy_chars); return chars_data; @@ -163,25 +163,19 @@ std::unique_ptr make_strings_column(CharIterator chars_begin, size_type null_count, rmm::device_buffer&& null_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); size_type strings_count = thrust::distance(offsets_begin, offsets_end) - 1; - size_type bytes = std::distance(chars_begin, chars_end) * sizeof(char); - if (strings_count == 0) return make_empty_column(type_id::STRING); + if (strings_count == 0) { return make_empty_column(type_id::STRING); } + int64_t const bytes = std::distance(chars_begin, chars_end) * sizeof(char); CUDF_EXPECTS(bytes >= 0, "invalid offsets data"); // build offsets column -- this is the number of strings + 1 - auto offsets_column = make_numeric_column( - data_type{type_to_id()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); - auto offsets_view = offsets_column->mutable_view(); - thrust::transform(rmm::exec_policy(stream), - offsets_begin, - offsets_end, - offsets_view.data(), - cuda::proclaim_return_type( - [] __device__(auto offset) { return static_cast(offset); })); + auto [offsets_column, computed_bytes] = + cudf::strings::detail::make_offsets_child_column(offsets_begin, offsets_end, stream, mr); + CUDF_EXPECTS(bytes == computed_bytes, "unexpected byte count"); // build chars column rmm::device_uvector chars_data(bytes, stream, mr); diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index 8d8065dbcaf..4467a9d0023 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -22,11 +22,30 @@ #include #include +#include namespace cudf { namespace strings { namespace detail { 
+/** + * @brief Create an offsets column to be a child of a strings column + * + * This will return the properly typed column to be filled in by the caller + * given the number of bytes to address. + * + * @param chars_bytes Number of bytes for the chars in the strings column + * @param count Number of elements for the offsets column. + * This is the number of rows in the parent strings column +1. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return The offsets child column for a strings column + */ +std::unique_ptr create_offsets_child_column(int64_t chars_bytes, + size_type count, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + /** * @brief Creates a string_view vector from a strings column. * @@ -38,7 +57,7 @@ namespace detail { rmm::device_uvector create_string_vector_from_column( cudf::strings_column_view const strings, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Return the threshold size for a strings column to use int64 offsets @@ -52,6 +71,15 @@ rmm::device_uvector create_string_vector_from_column( */ int64_t get_offset64_threshold(); +/** + * @brief Checks if large strings is enabled + * + * This checks the setting in the environment variable LIBCUDF_LARGE_STRINGS_ENABLED. + * + * @return true if large strings are supported + */ +bool is_large_strings_enabled(); + /** * @brief Return a normalized offset value from a strings offsets column * diff --git a/cpp/include/cudf/strings/extract.hpp b/cpp/include/cudf/strings/extract.hpp index a4db1ac46da..4138e1e59d5 100644 --- a/cpp/include/cudf/strings/extract.hpp +++ b/cpp/include/cudf/strings/extract.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -62,8 +63,8 @@ struct regex_program; std::unique_ptr
extract( strings_column_view const& input, regex_program const& prog, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a lists column of strings where each string column row corresponds to the @@ -98,8 +99,8 @@ std::unique_ptr
extract( std::unique_ptr extract_all_record( strings_column_view const& input, regex_program const& prog, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp index c1aa8b294b3..c116dbc2fe1 100644 --- a/cpp/include/cudf/strings/find.hpp +++ b/cpp/include/cudf/strings/find.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -55,10 +56,10 @@ namespace strings { std::unique_ptr find( strings_column_view const& input, string_scalar const& target, - size_type start = 0, - size_type stop = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + size_type start = 0, + size_type stop = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a column of character position values where the target @@ -86,10 +87,10 @@ std::unique_ptr find( std::unique_ptr rfind( strings_column_view const& input, string_scalar const& target, - size_type start = 0, - size_type stop = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + size_type start = 0, + size_type stop = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + 
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a column of character position values where the target @@ -114,9 +115,9 @@ std::unique_ptr rfind( std::unique_ptr find( strings_column_view const& input, strings_column_view const& target, - size_type start = 0, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + size_type start = 0, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a column of boolean values for each string where true indicates @@ -136,8 +137,8 @@ std::unique_ptr find( std::unique_ptr contains( strings_column_view const& input, string_scalar const& target, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a column of boolean values for each string where true indicates @@ -161,8 +162,8 @@ std::unique_ptr contains( std::unique_ptr contains( strings_column_view const& input, strings_column_view const& targets, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a column of boolean values for each string where true indicates @@ -183,8 +184,8 @@ std::unique_ptr contains( std::unique_ptr starts_with( strings_column_view const& input, string_scalar const& target, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + 
rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a column of boolean values for each string where true indicates @@ -209,8 +210,8 @@ std::unique_ptr starts_with( std::unique_ptr starts_with( strings_column_view const& input, strings_column_view const& targets, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a column of boolean values for each string where true indicates @@ -231,8 +232,8 @@ std::unique_ptr starts_with( std::unique_ptr ends_with( strings_column_view const& input, string_scalar const& target, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a column of boolean values for each string where true indicates @@ -257,8 +258,8 @@ std::unique_ptr ends_with( std::unique_ptr ends_with( strings_column_view const& input, strings_column_view const& targets, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp index 06b851c5012..c2e82aa6f1a 100644 --- a/cpp/include/cudf/strings/find_multiple.hpp +++ b/cpp/include/cudf/strings/find_multiple.hpp @@ -1,5 +1,5 @@ 
/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -57,8 +58,8 @@ namespace strings { std::unique_ptr find_multiple( strings_column_view const& input, strings_column_view const& targets, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp index 379b9624dc6..abc1d28ee4c 100644 --- a/cpp/include/cudf/strings/findall.hpp +++ b/cpp/include/cudf/strings/findall.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -64,8 +65,8 @@ struct regex_program; std::unique_ptr findall( strings_column_view const& input, regex_program const& prog, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/padding.hpp b/cpp/include/cudf/strings/padding.hpp index f0cb351eeda..f1382d6ea29 100644 --- a/cpp/include/cudf/strings/padding.hpp +++ b/cpp/include/cudf/strings/padding.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -58,10 +59,10 @@ namespace strings { std::unique_ptr pad( strings_column_view const& input, size_type width, - side_type side = side_type::RIGHT, - std::string_view fill_char = " ", - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + side_type side = side_type::RIGHT, + std::string_view fill_char = " ", + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Add '0' as padding to the left of each string. 
@@ -90,8 +91,8 @@ std::unique_ptr pad( std::unique_ptr zfill( strings_column_view const& input, size_type width, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp index 7dc9c33f579..cbf1edc8331 100644 --- a/cpp/include/cudf/strings/repeat_strings.hpp +++ b/cpp/include/cudf/strings/repeat_strings.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -59,8 +60,8 @@ namespace strings { std::unique_ptr repeat_string( string_scalar const& input, size_type repeat_times, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Repeat each string in the given strings column a given number of times @@ -90,8 +91,8 @@ std::unique_ptr repeat_string( std::unique_ptr repeat_strings( strings_column_view const& input, size_type repeat_times, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Repeat each string in the given strings column by 
the numbers of times given in another @@ -127,8 +128,8 @@ std::unique_ptr repeat_strings( std::unique_ptr repeat_strings( strings_column_view const& input, column_view const& repeat_times, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp index 2476a41e886..9525db44b69 100644 --- a/cpp/include/cudf/strings/replace.hpp +++ b/cpp/include/cudf/strings/replace.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -67,9 +68,9 @@ std::unique_ptr replace( strings_column_view const& input, string_scalar const& target, string_scalar const& repl, - cudf::size_type maxrepl = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::size_type maxrepl = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief This function replaces each string in the column with the provided @@ -107,11 +108,11 @@ std::unique_ptr replace( */ std::unique_ptr replace_slice( strings_column_view const& input, - string_scalar const& repl = string_scalar(""), - size_type start = 0, - size_type stop = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + 
string_scalar const& repl = string_scalar(""), + size_type start = 0, + size_type stop = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Replaces substrings matching a list of targets with the corresponding @@ -156,8 +157,8 @@ std::unique_ptr replace( strings_column_view const& input, strings_column_view const& targets, strings_column_view const& repls, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp index 77db2882253..f61f9585144 100644 --- a/cpp/include/cudf/strings/replace_re.hpp +++ b/cpp/include/cudf/strings/replace_re.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ #include #include +#include #include @@ -59,7 +60,7 @@ std::unique_ptr replace_re( string_scalar const& replacement = string_scalar(""), std::optional max_replace_count = std::nullopt, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief For each string, replaces any character sequence matching the given patterns @@ -81,9 +82,9 @@ std::unique_ptr replace_re( strings_column_view const& input, std::vector const& patterns, strings_column_view const& replacements, - regex_flags const flags = regex_flags::DEFAULT, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + regex_flags const flags = regex_flags::DEFAULT, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief For each string, replaces any character sequence matching the given regex @@ -107,8 +108,8 @@ std::unique_ptr replace_with_backrefs( strings_column_view const& input, regex_program const& prog, std::string_view replacement, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/reverse.hpp b/cpp/include/cudf/strings/reverse.hpp index 4fc8fbf67c2..86656693c8b 100644 --- a/cpp/include/cudf/strings/reverse.hpp +++ b/cpp/include/cudf/strings/reverse.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -47,8 +48,8 @@ namespace strings { */ std::unique_ptr reverse( strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/slice.hpp b/cpp/include/cudf/strings/slice.hpp index f106663be9b..e2be6abd344 100644 --- a/cpp/include/cudf/strings/slice.hpp +++ b/cpp/include/cudf/strings/slice.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -64,7 +65,7 @@ std::unique_ptr slice_strings( numeric_scalar const& stop = numeric_scalar(0, false), numeric_scalar const& step = numeric_scalar(1), rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a new strings column that contains substrings of the @@ -108,8 +109,8 @@ std::unique_ptr slice_strings( strings_column_view const& input, column_view const& starts, column_view const& stops, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/split/partition.hpp b/cpp/include/cudf/strings/split/partition.hpp index 25eedf1e86b..0a837034ba1 100644 --- a/cpp/include/cudf/strings/split/partition.hpp +++ b/cpp/include/cudf/strings/split/partition.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -60,9 +61,9 @@ namespace strings { */ std::unique_ptr
partition( strings_column_view const& input, - string_scalar const& delimiter = string_scalar(""), - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + string_scalar const& delimiter = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a set of 3 columns by splitting each string using the @@ -94,9 +95,9 @@ std::unique_ptr
partition( */ std::unique_ptr
rpartition( strings_column_view const& input, - string_scalar const& delimiter = string_scalar(""), - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + string_scalar const& delimiter = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/split/split.hpp b/cpp/include/cudf/strings/split/split.hpp index a34a59577a0..d5c44406ca7 100644 --- a/cpp/include/cudf/strings/split/split.hpp +++ b/cpp/include/cudf/strings/split/split.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -54,10 +55,10 @@ namespace strings { */ std::unique_ptr
split( strings_column_view const& strings_column, - string_scalar const& delimiter = string_scalar(""), - size_type maxsplit = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + string_scalar const& delimiter = string_scalar(""), + size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a list of columns by splitting each string using the @@ -84,10 +85,10 @@ std::unique_ptr
split( */ std::unique_ptr
rsplit( strings_column_view const& strings_column, - string_scalar const& delimiter = string_scalar(""), - size_type maxsplit = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + string_scalar const& delimiter = string_scalar(""), + size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Splits individual strings elements into a list of strings. @@ -158,10 +159,10 @@ std::unique_ptr
rsplit( */ std::unique_ptr split_record( strings_column_view const& strings, - string_scalar const& delimiter = string_scalar(""), - size_type maxsplit = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + string_scalar const& delimiter = string_scalar(""), + size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Splits individual strings elements into a list of strings starting @@ -237,10 +238,10 @@ std::unique_ptr split_record( */ std::unique_ptr rsplit_record( strings_column_view const& strings, - string_scalar const& delimiter = string_scalar(""), - size_type maxsplit = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + string_scalar const& delimiter = string_scalar(""), + size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp index f1736cb7e0c..81595fa7ed4 100644 --- a/cpp/include/cudf/strings/split/split_re.hpp +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -82,9 +83,9 @@ struct regex_program; std::unique_ptr
split_re( strings_column_view const& input, regex_program const& prog, - size_type maxsplit = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Splits strings elements into a table of strings columns using a @@ -138,9 +139,9 @@ std::unique_ptr
split_re( std::unique_ptr
rsplit_re( strings_column_view const& input, regex_program const& prog, - size_type maxsplit = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Splits strings elements into a list column of strings @@ -196,9 +197,9 @@ std::unique_ptr
rsplit_re( std::unique_ptr split_record_re( strings_column_view const& input, regex_program const& prog, - size_type maxsplit = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Splits strings elements into a list column of strings using the given @@ -256,9 +257,9 @@ std::unique_ptr split_record_re( std::unique_ptr rsplit_record_re( strings_column_view const& input, regex_program const& prog, - size_type maxsplit = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index 1156f0a5b73..1e9e73cef4c 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -85,28 +85,6 @@ class strings_column_view : private column_view { */ [[nodiscard]] column_view offsets() const; - /** - * @brief Return an iterator for the offsets child column. - * - * @deprecated Since 24.04 - * - * This automatically applies the offset of the parent. - * - * @return Iterator pointing to the first offset value. - */ - [[deprecated]] offset_iterator offsets_begin() const; - - /** - * @brief Return an end iterator for the offsets child column. - * - * @deprecated Since 24.04 - * - * This automatically applies the offset of the parent. - * - * @return Iterator pointing 1 past the last offset value. 
- */ - [[deprecated]] offset_iterator offsets_end() const; - /** * @brief Returns the number of bytes in the chars child column. * diff --git a/cpp/include/cudf/strings/strip.hpp b/cpp/include/cudf/strings/strip.hpp index 556d6805ac3..6fb9bbc45e6 100644 --- a/cpp/include/cudf/strings/strip.hpp +++ b/cpp/include/cudf/strings/strip.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -63,10 +64,10 @@ namespace strings { */ std::unique_ptr strip( strings_column_view const& input, - side_type side = side_type::BOTH, - string_scalar const& to_strip = string_scalar(""), - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + side_type side = side_type::BOTH, + string_scalar const& to_strip = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/translate.hpp b/cpp/include/cudf/strings/translate.hpp index 4bd09352b09..9cd6b7d5974 100644 --- a/cpp/include/cudf/strings/translate.hpp +++ b/cpp/include/cudf/strings/translate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ #include #include +#include #include @@ -56,8 +57,8 @@ namespace strings { std::unique_ptr translate( strings_column_view const& input, std::vector> const& chars_table, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Removes or keeps the specified character ranges in cudf::strings::filter_characters @@ -101,10 +102,10 @@ enum class filter_type : bool { std::unique_ptr filter_characters( strings_column_view const& input, std::vector> characters_to_filter, - filter_type keep_characters = filter_type::KEEP, - string_scalar const& replacement = string_scalar(""), - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + filter_type keep_characters = filter_type::KEEP, + string_scalar const& replacement = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/wrap.hpp b/cpp/include/cudf/strings/wrap.hpp index efdc3e62aff..c05c33fbac8 100644 --- a/cpp/include/cudf/strings/wrap.hpp +++ b/cpp/include/cudf/strings/wrap.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,6 +19,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -66,8 +67,8 @@ namespace strings { std::unique_ptr wrap( strings_column_view const& input, size_type width, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/structs/detail/concatenate.hpp b/cpp/include/cudf/structs/detail/concatenate.hpp index 82ccca188e2..5dc3169c0c4 100644 --- a/cpp/include/cudf/structs/detail/concatenate.hpp +++ b/cpp/include/cudf/structs/detail/concatenate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,8 @@ #include #include +#include + namespace cudf { namespace structs { namespace detail { @@ -50,7 +52,7 @@ namespace detail { */ std::unique_ptr concatenate(host_span columns, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace structs diff --git a/cpp/include/cudf/structs/detail/scan.hpp b/cpp/include/cudf/structs/detail/scan.hpp index 531e0a6c65f..c97a8452ecd 100644 --- a/cpp/include/cudf/structs/detail/scan.hpp +++ b/cpp/include/cudf/structs/detail/scan.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,6 +19,7 @@ #include #include +#include namespace cudf { namespace structs { @@ -38,7 +39,7 @@ namespace detail { template std::unique_ptr scan_inclusive(column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace structs diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp index 439b02c2d53..8efe6eb8c72 100644 --- a/cpp/include/cudf/table/table.hpp +++ b/cpp/include/cudf/table/table.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -56,8 +57,8 @@ class table { * @param mr Device memory resource to use for all device memory allocations */ explicit table(table const& other, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Moves the contents from a vector of `unique_ptr`s to columns to * construct a new table. 
@@ -75,8 +76,8 @@ class table { * @param mr Device memory resource used for allocating the device memory for the new columns */ table(table_view view, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the number of columns in the table diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index 4f3b23747e6..ad12b1eef4e 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -339,15 +339,6 @@ bool has_nested_nullable_columns(table_view const& input); */ std::vector get_nullable_columns(table_view const& table); -/** - * @brief Checks if two `table_view`s have columns of same types - * - * @param lhs left-side table_view operand - * @param rhs right-side table_view operand - * @return boolean comparison result - */ -bool have_same_types(table_view const& lhs, table_view const& rhs); - /** * @brief Copy column_views from a table_view into another table_view according to * a column indices map. diff --git a/cpp/include/cudf/timezone.hpp b/cpp/include/cudf/timezone.hpp index 56678c73811..7f65128526e 100644 --- a/cpp/include/cudf/timezone.hpp +++ b/cpp/include/cudf/timezone.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -49,6 +50,6 @@ static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years; std::unique_ptr
make_timezone_transition_table( std::optional tzif_dir, std::string_view timezone_name, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace cudf diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp index 49ec3d7c0d5..7bb9fb7a42e 100644 --- a/cpp/include/cudf/transform.hpp +++ b/cpp/include/cudf/transform.hpp @@ -20,6 +20,7 @@ #include #include +#include #include @@ -54,7 +55,7 @@ std::unique_ptr transform( std::string const& unary_udf, data_type output_type, bool is_ptx, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Creates a null_mask from `input` by converting `NaN` to null and @@ -69,7 +70,7 @@ std::unique_ptr transform( */ std::pair, size_type> nans_to_nulls( column_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Compute a new column by evaluating an expression tree on a table. @@ -87,7 +88,7 @@ std::pair, size_type> nans_to_nulls( std::unique_ptr compute_column( table_view const& table, ast::expression const& expr, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Creates a bitmask from a column of boolean elements. 
@@ -106,7 +107,7 @@ std::unique_ptr compute_column( */ std::pair, cudf::size_type> bools_to_mask( column_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Encode the rows of the given table as integers @@ -134,7 +135,7 @@ std::pair, cudf::size_type> bools_to_mask( */ std::pair, std::unique_ptr> encode( cudf::table_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Encodes `input` by generating a new column for each value in `categories` indicating the @@ -166,7 +167,7 @@ std::pair, std::unique_ptr> encode( std::pair, table_view> one_hot_encode( column_view const& input, column_view const& categories, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Creates a boolean column from given bitmask. 
@@ -193,7 +194,7 @@ std::unique_ptr mask_to_bools( bitmask_type const* bitmask, size_type begin_bit, size_type end_bit, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns an approximate cumulative size in bits of all columns in the `table_view` for @@ -221,8 +222,7 @@ std::unique_ptr mask_to_bools( * @return A 32-bit integer column containing the per-row bit counts */ std::unique_ptr row_bit_count( - table_view const& t, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + table_view const& t, rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns an approximate cumulative size in bits of all columns in the `table_view` for @@ -245,7 +245,7 @@ std::unique_ptr row_bit_count( std::unique_ptr segmented_row_bit_count( table_view const& t, size_type segment_length, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/transpose.hpp b/cpp/include/cudf/transpose.hpp index e5d083ae7b3..c01a04afe87 100644 --- a/cpp/include/cudf/transpose.hpp +++ b/cpp/include/cudf/transpose.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,6 +19,7 @@ #include #include +#include namespace cudf { /** @@ -44,7 +45,7 @@ namespace cudf { */ std::pair, table_view> transpose( table_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp index 64e802d88dd..74c8bc67d3a 100644 --- a/cpp/include/cudf/unary.hpp +++ b/cpp/include/cudf/unary.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,10 +16,13 @@ #pragma once +#include #include #include +#include #include +#include #include @@ -31,6 +34,77 @@ namespace cudf { * @brief Column APIs for unary ops */ +/** + * @brief Convert a floating-point value to fixed point + * + * @note This conversion was moved from fixed-point member functions to free functions. + * This is so that the complex conversion code is not included into many parts of the + * code base that don't need it, and so that it's more obvious to pinpoint where these + * conversions are occurring. 
+ * + * @tparam Fixed The fixed-point type to convert to + * @tparam Floating The floating-point type to convert from + * @param floating The floating-point value to convert + * @param scale The desired scale of the fixed-point value + * @return The converted fixed-point value + */ +template () && + cuda::std::is_floating_point_v>* = nullptr> +CUDF_HOST_DEVICE Fixed convert_floating_to_fixed(Floating floating, numeric::scale_type scale) +{ + using Rep = typename Fixed::rep; + auto const shifted = numeric::detail::shift(floating, scale); + numeric::scaled_integer scaled{static_cast(shifted), scale}; + return Fixed(scaled); +} + +/** + * @brief Convert a fixed-point value to floating point + * + * @note This conversion was moved from fixed-point member functions to free functions. + * This is so that the complex conversion code is not included into many parts of the + * code base that don't need it, and so that it's more obvious to pinpoint where these + * conversions are occurring. + * + * @tparam Floating The floating-point type to convert to + * @tparam Fixed The fixed-point type to convert from + * @param fixed The fixed-point value to convert + * @return The converted floating-point value + */ +template && + is_fixed_point()>* = nullptr> +CUDF_HOST_DEVICE Floating convert_fixed_to_floating(Fixed fixed) +{ + using Rep = typename Fixed::rep; + auto const casted = static_cast(fixed.value()); + auto const scale = numeric::scale_type{-fixed.scale()}; + return numeric::detail::shift(casted, scale); +} + +/** + * @brief Convert a value to floating point + * + * @tparam Floating The floating-point type to convert to + * @tparam Input The input type to convert from + * @param input The input value to convert + * @return The converted floating-point value + */ +template >* = nullptr> +CUDF_HOST_DEVICE Floating convert_to_floating(Input input) +{ + if constexpr (is_fixed_point()) { + return convert_fixed_to_floating(input); + } else { + return static_cast(input); + } +} + 
/** * @brief Types of unary operations that can be performed on data. */ @@ -74,8 +148,8 @@ enum class unary_operator : int32_t { std::unique_ptr unary_operation( cudf::column_view const& input, cudf::unary_operator op, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Creates a column of `type_id::BOOL8` elements where for every element in `input` `true` @@ -90,8 +164,8 @@ std::unique_ptr unary_operation( */ std::unique_ptr is_null( cudf::column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Creates a column of `type_id::BOOL8` elements where for every element in `input` `true` @@ -106,8 +180,8 @@ std::unique_ptr is_null( */ std::unique_ptr is_valid( cudf::column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Casts data from dtype specified in input to dtype specified in output. 
@@ -125,8 +199,8 @@ std::unique_ptr is_valid( std::unique_ptr cast( column_view const& input, data_type out_type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Creates a column of `type_id::BOOL8` elements indicating the presence of `NaN` values @@ -143,8 +217,8 @@ std::unique_ptr cast( */ std::unique_ptr is_nan( cudf::column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Creates a column of `type_id::BOOL8` elements indicating the absence of `NaN` values @@ -162,8 +236,8 @@ std::unique_ptr is_nan( */ std::unique_ptr is_not_nan( cudf::column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/src/io/utilities/thread_pool.hpp b/cpp/include/cudf/utilities/thread_pool.hpp similarity index 100% rename from cpp/src/io/utilities/thread_pool.hpp rename to cpp/include/cudf/utilities/thread_pool.hpp diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index 2dda0740b96..d191e44228a 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -397,7 +397,10 @@ template constexpr inline bool is_fixed_point() { return std::is_same_v || std::is_same_v || - std::is_same_v; + std::is_same_v || + std::is_same_v, T> || + std::is_same_v, T> || + std::is_same_v, T>; } /** diff --git a/cpp/include/cudf/utilities/type_checks.hpp b/cpp/include/cudf/utilities/type_checks.hpp index b925fc8ae92..fd3b0581c11 100644 --- a/cpp/include/cudf/utilities/type_checks.hpp +++ b/cpp/include/cudf/utilities/type_checks.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,11 +16,16 @@ #pragma once #include +#include + +#include namespace cudf { /** - * @brief Compares the type of two `column_view`s + * @brief Compare the types of two `column_view`s + * + * @deprecated Since 24.06. Use cudf::have_same_types instead. * * This function returns true if the type of `lhs` equals that of `rhs`. * - For fixed point types, the scale is compared. @@ -34,10 +39,11 @@ namespace cudf { * @param rhs The second `column_view` to compare * @return true if column types match */ -bool column_types_equal(column_view const& lhs, column_view const& rhs); +[[deprecated]] bool column_types_equal(column_view const& lhs, column_view const& rhs); /** * @brief Compare the type IDs of two `column_view`s + * * This function returns true if the type of `lhs` equals that of `rhs`. * - For fixed point types, the scale is ignored. 
* @@ -47,4 +53,98 @@ bool column_types_equal(column_view const& lhs, column_view const& rhs); */ bool column_types_equivalent(column_view const& lhs, column_view const& rhs); +/** + * @brief Compares the type of two `column_view`s + * + * This function returns true if the type of `lhs` equals that of `rhs`. + * - For fixed point types, the scale is compared. + * - For dictionary types, the type of the keys are compared if both are + * non-empty columns. + * - For lists types, the type of child columns are compared recursively. + * - For struct types, the type of each field are compared in order. + * - For all other types, the `id` of `data_type` is compared. + * + * @param lhs The first `column_view` to compare + * @param rhs The second `column_view` to compare + * @return true if types match + */ +bool have_same_types(column_view const& lhs, column_view const& rhs); + +/** + * @brief Compare the types of a `column_view` and a `scalar` + * + * This function returns true if the type of `lhs` equals that of `rhs`. + * - For fixed point types, the scale is compared. + * - For dictionary column types, the type of the keys is compared to the + * scalar type. + * - For lists types, the types of child columns are compared recursively. + * - For struct types, the types of each field are compared in order. + * - For all other types, the `id` of `data_type` is compared. + * + * @param lhs The `column_view` to compare + * @param rhs The `scalar` to compare + * @return true if types match + */ +bool have_same_types(column_view const& lhs, scalar const& rhs); + +/** + * @brief Compare the types of a `scalar` and a `column_view` + * + * This function returns true if the type of `lhs` equals that of `rhs`. + * - For fixed point types, the scale is compared. + * - For dictionary column types, the type of the keys is compared to the + * scalar type. + * - For lists types, the types of child columns are compared recursively. 
+ * - For struct types, the types of each field are compared in order. + * - For all other types, the `id` of `data_type` is compared. + * + * @param lhs The `scalar` to compare + * @param rhs The `column_view` to compare + * @return true if types match + */ +bool have_same_types(scalar const& lhs, column_view const& rhs); + +/** + * @brief Compare the types of two `scalar`s + * + * This function returns true if the type of `lhs` equals that of `rhs`. + * - For fixed point types, the scale is compared. + * - For lists types, the types of child columns are compared recursively. + * - For struct types, the types of each field are compared in order. + * - For all other types, the `id` of `data_type` is compared. + * + * @param lhs The first `scalar` to compare + * @param rhs The second `scalar` to compare + * @return true if types match + */ +bool have_same_types(scalar const& lhs, scalar const& rhs); + +/** + * @brief Checks if two `table_view`s have columns of same types + * + * @param lhs left-side table_view operand + * @param rhs right-side table_view operand + * @return boolean comparison result + */ +bool have_same_types(table_view const& lhs, table_view const& rhs); + +/** + * @brief Compare the types of a range of `column_view` or `scalar` objects + * + * This function returns true if all objects in the range have the same type, in the sense of + * cudf::have_same_types. 
+ * + * @tparam ForwardIt Forward iterator + * @param first The first iterator + * @param last The last iterator + * @return true if all types match + */ +template +inline bool all_have_same_types(ForwardIt first, ForwardIt last) +{ + return first == last || std::all_of(std::next(first), last, [want = *first](auto const& c) { + return cudf::have_same_types(want, c); + }); +} + } // namespace cudf diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp index 14b94e061ae..18f75bbc842 100644 --- a/cpp/include/cudf_test/base_fixture.hpp +++ b/cpp/include/cudf_test/base_fixture.hpp @@ -23,6 +23,7 @@ #include #include +#include namespace cudf { namespace test { @@ -36,7 +37,7 @@ namespace test { * ``` */ class BaseFixture : public ::testing::Test { - rmm::mr::device_memory_resource* _mr{rmm::mr::get_current_device_resource()}; + rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()}; public: /** @@ -44,7 +45,7 @@ class BaseFixture : public ::testing::Test { * all tests inheriting from this fixture * @return pointer to memory resource */ - rmm::mr::device_memory_resource* mr() { return _mr; } + rmm::device_async_resource_ref mr() { return _mr; } }; /** @@ -57,7 +58,7 @@ class BaseFixture : public ::testing::Test { */ template class BaseFixtureWithParam : public ::testing::TestWithParam { - rmm::mr::device_memory_resource* _mr{rmm::mr::get_current_device_resource()}; + rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()}; public: /** @@ -65,7 +66,7 @@ class BaseFixtureWithParam : public ::testing::TestWithParam { * all tests inheriting from this fixture * @return pointer to memory resource */ - rmm::mr::device_memory_resource* mr() const { return _mr; } + rmm::device_async_resource_ref mr() const { return _mr; } }; /** diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp index a8957473175..c83599a8072 100644 --- 
a/cpp/include/cudf_test/column_utilities.hpp +++ b/cpp/include/cudf_test/column_utilities.hpp @@ -210,6 +210,29 @@ template <> std::pair, std::vector> to_host(column_view c); //! @endcond +/** + * @brief For enabling large strings testing in specific tests + */ +struct large_strings_enabler { + /** + * @brief Create large strings enable object + * + * @param default_enable Default enables large strings support + */ + large_strings_enabler(bool default_enable = true); + ~large_strings_enabler(); + + /** + * @brief Enable large strings support + */ + void enable(); + + /** + * @brief Disable large strings support + */ + void disable(); +}; + } // namespace cudf::test // Macros for showing line of failure. @@ -242,3 +265,5 @@ std::pair, std::vector> to_host(c SCOPED_TRACE(" <-- line of failure\n"); \ cudf::test::detail::expect_equal_buffers(lhs, rhs, size_bytes); \ } while (0) + +#define CUDF_TEST_ENABLE_LARGE_STRINGS() cudf::test::large_strings_enabler ls___ diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 151fe50be4f..dc873658abf 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -17,7 +17,6 @@ #pragma once #include -#include #include #include diff --git a/cpp/include/cudf_test/cudf_gtest.hpp b/cpp/include/cudf_test/cudf_gtest.hpp index fa76204d622..89394fbd1c3 100644 --- a/cpp/include/cudf_test/cudf_gtest.hpp +++ b/cpp/include/cudf_test/cudf_gtest.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,86 +16,6 @@ #pragma once -#ifdef GTEST_INCLUDE_GTEST_GTEST_H_ -#error "Don't include gtest/gtest.h directly, include cudf_gtest.hpp instead" -#endif - -/** - * @file cudf_gtest.hpp - * @brief Work around for GTests( <=v1.10 ) emulation of variadic templates in - * @verbatim ::Testing::Types @endverbatim - * - * @note Instead of including `gtest/gtest.h`, all libcudf test files should - * include `cudf_gtest.hpp` instead. - * - * Removes the 50 type limit in a type-parameterized test list. - * - * Uses macros to rename GTests's emulated variadic template types and then - * redefines them properly. - */ - -// @cond -#if __has_include() -// gtest doesn't provide a version header so we need to -// use a file existence trick. -// gtest-type-util.h.pump only exists in versions < 1.11 -#define Types Types_NOT_USED -#define Types0 Types0_NOT_USED -#define TypeList TypeList_NOT_USED -#define Templates Templates_NOT_USED -#define Templates0 Templates0_NOT_USED -#include -#undef Types -#undef Types0 -#undef TypeList -#undef Templates -#undef Templates0 - -namespace testing { -template -struct Types { - using type = Types; -}; - -template -struct Types { - using Head = T; - using Tail = Types; - - using type = Types; -}; - -namespace internal { -using Types0 = Types<>; - -template -struct Templates {}; - -template -struct Templates { - using Head = internal::TemplateSel; - using Tail = Templates; - - using type = Templates; -}; - -using Templates0 = Templates<>; - -template -struct TypeList { - using type = Types; -}; - -template -struct TypeList> { - using type = Types; -}; - -} // namespace internal -} // namespace testing -#endif // gtest < 1.11 -// @endcond - #include #include diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp index 88e3088d794..66b831b917f 100644 --- a/cpp/include/cudf_test/testing_main.hpp +++ b/cpp/include/cudf_test/testing_main.hpp @@ -145,6 +145,51 @@ inline auto parse_cudf_test_opts(int argc, char** 
argv) } } +/** + * @brief Sets up stream mode memory resource adaptor + * + * The resource adaptor is only set as the current device resource if the + * stream mode is enabled. + * + * The caller must keep the return object alive for the life of the test runs. + * + * @param cmd_opts Command line options returned by parse_cudf_test_opts + * @return Memory resource adaptor + */ +inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts) +{ + auto const rmm_mode = cmd_opts["rmm_mode"].as(); + auto resource = cudf::test::create_memory_resource(rmm_mode); + rmm::mr::set_current_device_resource(resource.get()); + return resource; +} + +/** + * @brief Sets up stream mode memory resource adaptor + * + * The resource adaptor is only set as the current device resource if the + * stream mode is enabled. + * + * The caller must keep the return object alive for the life of the test runs. + * + * @param cmd_opts Command line options returned by parse_cudf_test_opts + * @return Memory resource adaptor + */ +inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts) +{ + auto resource = rmm::mr::get_current_device_resource(); + auto const stream_mode = cmd_opts["stream_mode"].as(); + auto const stream_error_mode = cmd_opts["stream_error_mode"].as(); + auto const error_on_invalid_stream = (stream_error_mode == "error"); + auto const check_default_stream = (stream_mode == "new_cudf_default"); + auto adaptor = + make_stream_checking_resource_adaptor(resource, error_on_invalid_stream, check_default_stream); + if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) { + rmm::mr::set_current_device_resource(&adaptor); + } + return adaptor; +} + /** * @brief Macro that defines main function for gtest programs that use rmm * @@ -155,25 +200,12 @@ inline auto parse_cudf_test_opts(int argc, char** argv) * function parses the command line to customize test behavior, like the * allocation mode used for creating the default memory 
resource. */ -#define CUDF_TEST_PROGRAM_MAIN() \ - int main(int argc, char** argv) \ - { \ - ::testing::InitGoogleTest(&argc, argv); \ - auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ - auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ - auto resource = cudf::test::create_memory_resource(rmm_mode); \ - rmm::mr::set_current_device_resource(resource.get()); \ - \ - auto const stream_mode = cmd_opts["stream_mode"].as(); \ - if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) { \ - auto const stream_error_mode = cmd_opts["stream_error_mode"].as(); \ - auto const error_on_invalid_stream = (stream_error_mode == "error"); \ - auto const check_default_stream = (stream_mode == "new_cudf_default"); \ - auto adaptor = make_stream_checking_resource_adaptor( \ - resource.get(), error_on_invalid_stream, check_default_stream); \ - rmm::mr::set_current_device_resource(&adaptor); \ - return RUN_ALL_TESTS(); \ - } \ - \ - return RUN_ALL_TESTS(); \ +#define CUDF_TEST_PROGRAM_MAIN() \ + int main(int argc, char** argv) \ + { \ + ::testing::InitGoogleTest(&argc, argv); \ + auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ + [[maybe_unused]] auto mr = make_memory_resource_adaptor(cmd_opts); \ + [[maybe_unused]] auto adaptor = make_stream_mode_adaptor(cmd_opts); \ + return RUN_ALL_TESTS(); \ } diff --git a/cpp/include/nvtext/byte_pair_encoding.hpp b/cpp/include/nvtext/byte_pair_encoding.hpp index 4d6d8335eac..375d44e367a 100644 --- a/cpp/include/nvtext/byte_pair_encoding.hpp +++ b/cpp/include/nvtext/byte_pair_encoding.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,8 @@ #include #include +#include + namespace nvtext { /** @@ -45,8 +47,8 @@ struct bpe_merge_pairs { * @param mr Device memory resource used to allocate the device memory */ bpe_merge_pairs(std::unique_ptr&& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new bpe merge pairs object @@ -56,8 +58,8 @@ struct bpe_merge_pairs { * @param mr Device memory resource used to allocate the device memory */ bpe_merge_pairs(cudf::strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); ~bpe_merge_pairs(); bpe_merge_pairs(); @@ -94,8 +96,8 @@ struct bpe_merge_pairs { */ std::unique_ptr load_merge_pairs( cudf::strings_column_view const& merge_pairs, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Byte pair encode the input strings. 
@@ -127,7 +129,7 @@ std::unique_ptr byte_pair_encoding( cudf::strings_column_view const& input, bpe_merge_pairs const& merges_pairs, cudf::string_scalar const& separator = cudf::string_scalar(" "), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace nvtext diff --git a/cpp/include/nvtext/detail/generate_ngrams.hpp b/cpp/include/nvtext/detail/generate_ngrams.hpp index 835124141d4..c4b89b6d495 100644 --- a/cpp/include/nvtext/detail/generate_ngrams.hpp +++ b/cpp/include/nvtext/detail/generate_ngrams.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,20 +18,21 @@ #include #include +#include namespace nvtext { namespace detail { /** * @copydoc hash_character_ngrams(cudf::strings_column_view const&, - * cudf::size_type, rmm::mr::device_memory_resource*) + * cudf::size_type, rmm::device_async_resource_ref) * * @param stream CUDA stream used for allocating/copying device memory and launching kernels */ std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace nvtext diff --git a/cpp/include/nvtext/detail/load_hash_file.hpp b/cpp/include/nvtext/detail/load_hash_file.hpp index f4107adb07e..0c27981f80b 100644 --- a/cpp/include/nvtext/detail/load_hash_file.hpp +++ b/cpp/include/nvtext/detail/load_hash_file.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -43,7 +44,7 @@ namespace detail { std::unique_ptr load_vocabulary_file( std::string const& filename_hashed_vocabulary, rmm::cuda_stream_view stream, - 
rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace nvtext diff --git a/cpp/include/nvtext/detail/tokenize.hpp b/cpp/include/nvtext/detail/tokenize.hpp index 80a6edc496b..d48027e4631 100644 --- a/cpp/include/nvtext/detail/tokenize.hpp +++ b/cpp/include/nvtext/detail/tokenize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,52 +21,53 @@ #include #include +#include namespace nvtext { namespace detail { /** * @copydoc nvtext::tokenize(strings_column_view const&,string_scalar - * const&,rmm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches */ std::unique_ptr tokenize(cudf::strings_column_view const& strings, cudf::string_scalar const& delimiter, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc nvtext::tokenize(strings_column_view const&,strings_column_view - * const&,rmm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches */ std::unique_ptr tokenize(cudf::strings_column_view const& strings, cudf::strings_column_view const& delimiters, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc nvtext::count_tokens(strings_column_view const&, string_scalar - * const&,rmm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches */ std::unique_ptr count_tokens(cudf::strings_column_view const& strings, cudf::string_scalar const& delimiter, rmm::cuda_stream_view 
stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @copydoc nvtext::count_tokens(strings_column_view const&,strings_column_view - * const&,rmm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches */ std::unique_ptr count_tokens(cudf::strings_column_view const& strings, cudf::strings_column_view const& delimiters, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace nvtext diff --git a/cpp/include/nvtext/edit_distance.hpp b/cpp/include/nvtext/edit_distance.hpp index 9a24662455b..bfdfb4d1a1c 100644 --- a/cpp/include/nvtext/edit_distance.hpp +++ b/cpp/include/nvtext/edit_distance.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,8 @@ #include #include +#include + //! NVText APIs namespace nvtext { /** @@ -60,8 +62,8 @@ namespace nvtext { std::unique_ptr edit_distance( cudf::strings_column_view const& input, cudf::strings_column_view const& targets, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Compute the edit distance between all the strings in the input column. 
@@ -98,8 +100,8 @@ std::unique_ptr edit_distance( */ std::unique_ptr edit_distance_matrix( cudf::strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace nvtext diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp index 46f2c0e7bc9..bebe2e46023 100644 --- a/cpp/include/nvtext/generate_ngrams.hpp +++ b/cpp/include/nvtext/generate_ngrams.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,8 @@ #include #include +#include + namespace nvtext { /** * @addtogroup nvtext_ngrams @@ -58,25 +60,23 @@ std::unique_ptr generate_ngrams( cudf::strings_column_view const& input, cudf::size_type ngrams, cudf::string_scalar const& separator, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** - * @brief Generates ngrams of characters within each string. + * @brief Generates ngrams of characters within each string * - * Each character of a string used to build ngrams. + * Each character of a string is used to build ngrams for the output row. * Ngrams are not created across strings. 
* * ``` - * ["ab", "cde", "fgh"] would generate bigrams as ["ab", "cd", "de", "fg", "gh"] + * ["ab", "cde", "fgh"] would generate bigrams as + * [["ab"], ["cd", "de"], ["fg", "gh"]] * ``` * - * The size of the output column will be the total number of ngrams generated from - * the input strings column. + * All null row entries are ignored and the corresponding output row will be empty. * - * All null row entries are ignored and the output contains all valid rows. - * - * @throw cudf::logic_error if `ngrams < 2` + * @throw std::invalid_argument if `ngrams < 2` * @throw cudf::logic_error if there are not enough characters to generate any ngrams * * @param input Strings column to produce ngrams from @@ -84,13 +84,13 @@ std::unique_ptr generate_ngrams( * Default is 2 = bigram. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings columns of tokens + * @return Lists column of strings */ std::unique_ptr generate_character_ngrams( cudf::strings_column_view const& input, - cudf::size_type ngrams = 2, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::size_type ngrams = 2, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Hashes ngrams of characters within each string @@ -123,9 +123,9 @@ std::unique_ptr generate_character_ngrams( */ std::unique_ptr hash_character_ngrams( cudf::strings_column_view const& input, - cudf::size_type ngrams = 5, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::size_type ngrams = 5, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = 
rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace nvtext diff --git a/cpp/include/nvtext/jaccard.hpp b/cpp/include/nvtext/jaccard.hpp index 19d6c111200..649c17f0b1c 100644 --- a/cpp/include/nvtext/jaccard.hpp +++ b/cpp/include/nvtext/jaccard.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ #include #include +#include + namespace nvtext { /** * @addtogroup nvtext_jaccard @@ -72,8 +74,8 @@ std::unique_ptr jaccard_index( cudf::strings_column_view const& input1, cudf::strings_column_view const& input2, cudf::size_type width, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace nvtext diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 47c625b5079..7d3f6059454 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,8 @@ #include #include +#include + namespace nvtext { /** * @addtogroup nvtext_minhash @@ -53,7 +55,7 @@ std::unique_ptr minhash( cudf::numeric_scalar seed = 0, cudf::size_type width = 4, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the minhash values for each string per seed @@ -83,9 +85,9 @@ std::unique_ptr minhash( std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::device_span seeds, - cudf::size_type width = 4, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::size_type width = 4, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the minhash value for each string @@ -114,7 +116,7 @@ std::unique_ptr minhash64( cudf::numeric_scalar seed = 0, cudf::size_type width = 4, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the minhash values for each string per seed @@ -144,9 +146,9 @@ std::unique_ptr minhash64( std::unique_ptr minhash64( cudf::strings_column_view const& input, cudf::device_span seeds, - cudf::size_type width = 4, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::size_type width = 4, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace nvtext diff --git a/cpp/include/nvtext/ngrams_tokenize.hpp 
b/cpp/include/nvtext/ngrams_tokenize.hpp index 9d76ef8689f..09ce323a7ae 100644 --- a/cpp/include/nvtext/ngrams_tokenize.hpp +++ b/cpp/include/nvtext/ngrams_tokenize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,8 @@ #include #include +#include + namespace nvtext { /** * @addtogroup nvtext_ngrams @@ -80,8 +82,8 @@ std::unique_ptr ngrams_tokenize( cudf::size_type ngrams, cudf::string_scalar const& delimiter, cudf::string_scalar const& separator, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace nvtext diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index 3cbff5c744b..e5967e78318 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ #include #include +#include + //! NVText APIs namespace nvtext { /** @@ -51,8 +53,8 @@ namespace nvtext { */ std::unique_ptr normalize_spaces( cudf::strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Normalizes strings characters for tokenizing. 
@@ -102,8 +104,8 @@ std::unique_ptr normalize_spaces( std::unique_ptr normalize_characters( cudf::strings_column_view const& input, bool do_lower_case, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace nvtext diff --git a/cpp/include/nvtext/replace.hpp b/cpp/include/nvtext/replace.hpp index 88cf7d41901..aac21346c72 100644 --- a/cpp/include/nvtext/replace.hpp +++ b/cpp/include/nvtext/replace.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,8 @@ #include #include +#include + //! NVText APIs namespace nvtext { /** @@ -88,7 +90,7 @@ std::unique_ptr replace_tokens( cudf::strings_column_view const& replacements, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Removes tokens whose lengths are less than a specified number of characters. 
@@ -137,7 +139,7 @@ std::unique_ptr filter_tokens( cudf::string_scalar const& replacement = cudf::string_scalar{""}, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace nvtext diff --git a/cpp/include/nvtext/stemmer.hpp b/cpp/include/nvtext/stemmer.hpp index 0e1759fdc5a..20b81aba661 100644 --- a/cpp/include/nvtext/stemmer.hpp +++ b/cpp/include/nvtext/stemmer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,8 @@ #include #include +#include + namespace nvtext { /** * @addtogroup nvtext_stemmer @@ -79,8 +81,8 @@ std::unique_ptr is_letter( cudf::strings_column_view const& input, letter_type ltype, cudf::size_type character_index, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns boolean column indicating if character at `indices[i]` of `input[i]` @@ -132,8 +134,8 @@ std::unique_ptr is_letter( cudf::strings_column_view const& input, letter_type ltype, cudf::column_view const& indices, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the Porter Stemmer measurements of a strings column. 
@@ -166,8 +168,8 @@ std::unique_ptr is_letter( */ std::unique_ptr porter_stemmer_measure( cudf::strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace nvtext diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp index 72a899d70b4..a4e06495a1d 100644 --- a/cpp/include/nvtext/subword_tokenize.hpp +++ b/cpp/include/nvtext/subword_tokenize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,8 @@ #include #include +#include + namespace nvtext { /** @@ -65,7 +67,7 @@ struct hashed_vocabulary { */ std::unique_ptr load_vocabulary_file( std::string const& filename_hashed_vocabulary, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Result object for the subword_tokenize functions. @@ -155,7 +157,7 @@ tokenizer_result subword_tokenize( uint32_t stride, bool do_lower_case, bool do_truncate, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace nvtext diff --git a/cpp/include/nvtext/tokenize.hpp b/cpp/include/nvtext/tokenize.hpp index 107fefcc3bf..ea1b9c716f0 100644 --- a/cpp/include/nvtext/tokenize.hpp +++ b/cpp/include/nvtext/tokenize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,8 @@ #include #include +#include + namespace nvtext { /** * @addtogroup nvtext_tokenize @@ -60,7 +62,7 @@ std::unique_ptr tokenize( cudf::strings_column_view const& input, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a single column of strings by tokenizing the input strings @@ -95,8 +97,8 @@ std::unique_ptr tokenize( std::unique_ptr tokenize( cudf::strings_column_view const& input, cudf::strings_column_view const& delimiters, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the number of tokens in each string of a strings column. 
@@ -127,7 +129,7 @@ std::unique_ptr count_tokens( cudf::strings_column_view const& input, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the number of tokens in each string of a strings column @@ -158,8 +160,8 @@ std::unique_ptr count_tokens( std::unique_ptr count_tokens( cudf::strings_column_view const& input, cudf::strings_column_view const& delimiters, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a single column of strings by converting each character to a string. @@ -183,8 +185,8 @@ std::unique_ptr count_tokens( */ std::unique_ptr character_tokenize( cudf::strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Creates a strings column from a strings column of tokens and an @@ -225,7 +227,7 @@ std::unique_ptr detokenize( cudf::column_view const& row_indices, cudf::string_scalar const& separator = cudf::string_scalar(" "), rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Vocabulary object to be used with nvtext::tokenize_with_vocabulary @@ -246,8 +248,8 @@ struct tokenize_vocabulary { * @param mr Device memory 
resource used to allocate the returned column's device memory */ tokenize_vocabulary(cudf::strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); ~tokenize_vocabulary(); struct tokenize_vocabulary_impl; @@ -269,8 +271,8 @@ struct tokenize_vocabulary { */ std::unique_ptr load_vocabulary( cudf::strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the token ids for the input string by looking up each delimited @@ -301,9 +303,9 @@ std::unique_ptr tokenize_with_vocabulary( cudf::strings_column_view const& input, tokenize_vocabulary const& vocabulary, cudf::string_scalar const& delimiter, - cudf::size_type default_id = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::size_type default_id = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of tokenize group } // namespace nvtext diff --git a/cpp/scripts/run-cmake-format.sh b/cpp/scripts/run-cmake-format.sh index f3e21779aa5..603880954a6 100755 --- a/cpp/scripts/run-cmake-format.sh +++ b/cpp/scripts/run-cmake-format.sh @@ -1,6 +1,5 @@ #!/bin/bash - -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # This script is a wrapper for cmakelang that may be used with pre-commit. 
The # wrapping is necessary because RAPIDS libraries split configuration for @@ -45,6 +44,7 @@ fi DEFAULT_FORMAT_FILE_LOCATIONS=( "${CUDF_BUILD_DIR:-${HOME}}/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json" + "${CUDF_BUILD_DIR:-cpp/build}/latest/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json" "cpp/libcudf_kafka/build/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json" ) diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index be91c3b4d08..ac31f9045fe 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -39,6 +39,7 @@ #include #include +#include #include @@ -56,7 +57,7 @@ std::pair scalar_col_valid_mask_and( column_view const& col, scalar const& s, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (col.is_empty()) return std::pair(rmm::device_buffer{0, stream, mr}, 0); @@ -76,9 +77,9 @@ std::pair scalar_col_valid_mask_and( */ inline bool is_null_dependent(binary_operator op) { - return op == binary_operator::NULL_EQUALS || op == binary_operator::NULL_MIN || - op == binary_operator::NULL_MAX || op == binary_operator::NULL_LOGICAL_AND || - op == binary_operator::NULL_LOGICAL_OR; + return op == binary_operator::NULL_EQUALS || op == binary_operator::NULL_NOT_EQUALS || + op == binary_operator::NULL_MIN || op == binary_operator::NULL_MAX || + op == binary_operator::NULL_LOGICAL_AND || op == binary_operator::NULL_LOGICAL_OR; } /** @@ -108,7 +109,8 @@ bool is_comparison_binop(binary_operator op) op == binary_operator::GREATER or // operator > op == binary_operator::LESS_EQUAL or // operator <= op == binary_operator::GREATER_EQUAL or // operator >= - op == binary_operator::NULL_EQUALS; // 2 null = true; 1 null = false; else == + op == binary_operator::NULL_EQUALS or // 2 null = true; 1 null = false; else == + op == binary_operator::NULL_NOT_EQUALS; // 2 null = false; 1 null = true; else != } /** @@ -179,7 +181,7 @@ void 
fixed_point_binary_operation_validation(binary_operator op, /** * @copydoc cudf::binary_operation(column_view const&, column_view const&, - * binary_operator, data_type, rmm::mr::device_memory_resource*) + * binary_operator, data_type, rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -189,7 +191,7 @@ std::unique_ptr binary_operation(LhsType const& lhs, binary_operator op, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if constexpr (std::is_same_v and std::is_same_v) CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match"); @@ -250,7 +252,7 @@ std::unique_ptr make_fixed_width_column_for_output(scalar const& lhs, binary_operator op, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (binops::is_null_dependent(op)) { return make_fixed_width_column(output_type, rhs.size(), mask_state::ALL_VALID, stream, mr); @@ -277,7 +279,7 @@ std::unique_ptr make_fixed_width_column_for_output(column_view const& lh binary_operator op, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (binops::is_null_dependent(op)) { return make_fixed_width_column(output_type, lhs.size(), mask_state::ALL_VALID, stream, mr); @@ -304,7 +306,7 @@ std::unique_ptr make_fixed_width_column_for_output(column_view const& lh binary_operator op, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (binops::is_null_dependent(op)) { return make_fixed_width_column(output_type, rhs.size(), mask_state::ALL_VALID, stream, mr); @@ -320,7 +322,7 @@ std::unique_ptr binary_operation(scalar const& lhs, binary_operator op, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + 
rmm::device_async_resource_ref mr) { return binops::compiled::binary_operation( lhs, rhs, op, output_type, stream, mr); @@ -330,7 +332,7 @@ std::unique_ptr binary_operation(column_view const& lhs, binary_operator op, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return binops::compiled::binary_operation( lhs, rhs, op, output_type, stream, mr); @@ -340,7 +342,7 @@ std::unique_ptr binary_operation(column_view const& lhs, binary_operator op, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return binops::compiled::binary_operation( lhs, rhs, op, output_type, stream, mr); @@ -351,7 +353,7 @@ std::unique_ptr binary_operation(column_view const& lhs, std::string const& ptx, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // Check for datatype auto is_type_supported_ptx = [](data_type type) -> bool { @@ -405,7 +407,7 @@ std::unique_ptr binary_operation(scalar const& lhs, binary_operator op, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); @@ -415,7 +417,7 @@ std::unique_ptr binary_operation(column_view const& lhs, binary_operator op, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); @@ -425,7 +427,7 @@ std::unique_ptr binary_operation(column_view const& lhs, binary_operator op, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::binary_operation(lhs, rhs, op, output_type, 
stream, mr); @@ -436,7 +438,7 @@ std::unique_ptr binary_operation(column_view const& lhs, std::string const& ptx, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::binary_operation(lhs, rhs, ptx, output_type, stream, mr); diff --git a/cpp/src/binaryop/compiled/NullNotEquals.cu b/cpp/src/binaryop/compiled/NullNotEquals.cu new file mode 100644 index 00000000000..34f73cca48a --- /dev/null +++ b/cpp/src/binaryop/compiled/NullNotEquals.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} // namespace cudf::binops::compiled diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index 1429635b803..ba0253ec853 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -50,7 +51,7 @@ struct scalar_as_column_view { template ())> return_type operator()(scalar const& s, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource*) + rmm::device_async_resource_ref) { auto& h_scalar_type_view = static_cast&>(const_cast(s)); auto col_v = column_view(s.type(), @@ -61,7 +62,7 @@ struct scalar_as_column_view { return std::pair{col_v, std::unique_ptr(nullptr)}; } template ())> - return_type operator()(scalar const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) + return_type operator()(scalar const&, rmm::cuda_stream_view, rmm::device_async_resource_ref) { CUDF_FAIL("Unsupported type"); } @@ -69,7 +70,7 @@ struct scalar_as_column_view { // specialization for cudf::string_view template <> scalar_as_column_view::return_type scalar_as_column_view::operator()( - scalar const& s, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) + scalar const& s, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { using T = cudf::string_view; auto& h_scalar_type_view = static_cast&>(const_cast(s)); @@ -96,7 +97,7 @@ scalar_as_column_view::return_type scalar_as_column_view::operator() scalar_as_column_view::return_type scalar_as_column_view::operator()( - scalar const& s, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) + scalar const& s, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { auto col = 
make_column_from_scalar(s, 1, stream, mr); return std::pair{col->view(), std::move(col)}; @@ -114,7 +115,7 @@ scalar_as_column_view::return_type scalar_as_column_view::operator() string_null_min_max(scalar const& lhs, binary_operator op, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // hard-coded to only work with cudf::string_view so we don't explode compile times CUDF_EXPECTS(lhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported lhs datatype"); @@ -280,7 +281,7 @@ std::unique_ptr string_null_min_max(column_view const& lhs, binary_operator op, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // hard-coded to only work with cudf::string_view so we don't explode compile times CUDF_EXPECTS(lhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported lhs datatype"); @@ -297,7 +298,7 @@ std::unique_ptr string_null_min_max(column_view const& lhs, binary_operator op, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // hard-coded to only work with cudf::string_view so we don't explode compile times CUDF_EXPECTS(lhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported lhs datatype"); @@ -355,6 +356,7 @@ case binary_operator::LOG_BASE: apply_binary_op(out, l case binary_operator::ATAN2: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; case binary_operator::PMOD: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; case binary_operator::NULL_EQUALS: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::NULL_NOT_EQUALS: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; case binary_operator::NULL_MAX: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; case 
binary_operator::NULL_MIN: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; case binary_operator::NULL_LOGICAL_AND: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; @@ -411,8 +413,9 @@ void apply_sorting_struct_binary_op(mutable_column_view& out, // Struct child column type and structure mismatches are caught within the two_table_comparator switch (op) { case binary_operator::EQUAL: [[fallthrough]]; + case binary_operator::NOT_EQUAL: [[fallthrough]]; case binary_operator::NULL_EQUALS: [[fallthrough]]; - case binary_operator::NOT_EQUAL: + case binary_operator::NULL_NOT_EQUALS: detail::apply_struct_equality_op( out, lhs, diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index d605c877d3f..5177e7d4bda 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -69,13 +70,17 @@ struct typed_casted_writer { if constexpr (mutable_column_device_view::has_element_accessor() and std::is_constructible_v) { col.element(i) = static_cast(val); - } else if constexpr (is_fixed_point() and - (is_fixed_point() or - std::is_constructible_v)) { - if constexpr (is_fixed_point()) - col.data()[i] = val.rescaled(numeric::scale_type{col.type().scale()}).value(); - else - col.data()[i] = Element{val, numeric::scale_type{col.type().scale()}}.value(); + } else if constexpr (is_fixed_point()) { + auto const scale = numeric::scale_type{col.type().scale()}; + if constexpr (is_fixed_point()) { + col.data()[i] = val.rescaled(scale).value(); + } else if constexpr (cuda::std::is_constructible_v) { + col.data()[i] = Element{val, scale}.value(); + } else if constexpr (cuda::std::is_floating_point_v) { + col.data()[i] = convert_floating_to_fixed(val, scale).value(); + } + } else if constexpr (cuda::std::is_floating_point_v and is_fixed_point()) { + col.data()[i] = 
convert_fixed_to_floating(val); } } }; @@ -104,6 +109,7 @@ struct ops_wrapper { type_dispatcher(rhs.type(), type_casted_accessor{}, i, rhs, is_rhs_scalar); auto result = [&]() { if constexpr (std::is_same_v or + std::is_same_v or std::is_same_v or std::is_same_v or std::is_same_v or diff --git a/cpp/src/binaryop/compiled/binary_ops.hpp b/cpp/src/binaryop/compiled/binary_ops.hpp index 47fd50c5d97..ceeba9cf817 100644 --- a/cpp/src/binaryop/compiled/binary_ops.hpp +++ b/cpp/src/binaryop/compiled/binary_ops.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include #include @@ -37,21 +38,21 @@ std::unique_ptr string_null_min_max(scalar const& lhs, binary_operator op, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); std::unique_ptr string_null_min_max(column_view const& lhs, scalar const& rhs, binary_operator op, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); std::unique_ptr string_null_min_max(column_view const& lhs, column_view const& rhs, binary_operator op, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Performs a binary operation between a string scalar and a string @@ -77,7 +78,7 @@ std::unique_ptr binary_operation(scalar const& lhs, binary_operator op, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Performs a binary operation between a string column and a string @@ -103,7 +104,7 @@ std::unique_ptr binary_operation(column_view const& lhs, binary_operator op, data_type 
output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Performs a binary operation between two string columns. @@ -128,7 +129,7 @@ std::unique_ptr binary_operation(column_view const& lhs, binary_operator op, data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); void binary_operation(mutable_column_view& out, scalar const& lhs, @@ -193,7 +194,7 @@ void apply_binary_op(mutable_column_view& out, * @brief Deploys single type or double type dispatcher that runs equality operation on each element * of @p lhs and @p rhs columns. * - * Comparison operators are EQUAL, NOT_EQUAL, NULL_EQUALS. + * Comparison operators are EQUAL, NOT_EQUAL, NULL_EQUALS, NULL_NOT_EQUALS. * @p out type is boolean. * * This template is instantiated for each binary operator. diff --git a/cpp/src/binaryop/compiled/operation.cuh b/cpp/src/binaryop/compiled/operation.cuh index 214803dc415..43b4bd232c4 100644 --- a/cpp/src/binaryop/compiled/operation.cuh +++ b/cpp/src/binaryop/compiled/operation.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -422,15 +422,26 @@ struct NullEquals { TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) -> decltype(x == y) { output_valid = true; - if (!lhs_valid && !rhs_valid) return true; if (lhs_valid && rhs_valid) return x == y; - return false; + return !lhs_valid && !rhs_valid; } // To allow std::is_invocable_v = true template __device__ inline auto operator()(TypeLhs x, TypeRhs y) -> decltype(x == y); }; +struct NullNotEquals { + template + __device__ inline auto operator()( + TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) -> decltype(x != y) + { + return !NullEquals{}(x, y, lhs_valid, rhs_valid, output_valid); + } + // To allow std::is_invocable_v = true + template + __device__ inline auto operator()(TypeLhs x, TypeRhs y) -> decltype(x != y); +}; + struct NullMax { template (out); case binary_operator::GREATER_EQUAL: return bool_op(out); case binary_operator::NULL_EQUALS: return bool_op(out); + case binary_operator::NULL_NOT_EQUALS: + return bool_op(out); case binary_operator::NULL_LOGICAL_AND: return bool_op(out); case binary_operator::NULL_LOGICAL_OR: diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 806beeb4efe..d0faeea8336 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -79,7 +80,7 @@ namespace detail { rmm::device_buffer create_null_mask(size_type size, mask_state state, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { size_type mask_size{0}; @@ -157,7 +158,7 @@ void set_null_mask(bitmask_type* bitmask, rmm::device_buffer create_null_mask(size_type size, mask_state state, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return detail::create_null_mask(size, state, stream, mr); } @@ -211,7 +212,7 @@ rmm::device_buffer copy_bitmask(bitmask_type const* mask, size_type 
begin_bit, size_type end_bit, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(begin_bit >= 0, "Invalid range."); @@ -235,7 +236,7 @@ rmm::device_buffer copy_bitmask(bitmask_type const* mask, // Create a bitmask from a column view rmm::device_buffer copy_bitmask(column_view const& view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); rmm::device_buffer null_mask{0, stream, mr}; @@ -268,8 +269,8 @@ CUDF_KERNEL void count_set_bits_kernel(bitmask_type const* bitmask, auto const first_word_index{word_index(first_bit_index)}; auto const last_word_index{word_index(last_bit_index)}; - thread_index_type const tid = grid_1d::global_thread_id(); - thread_index_type const stride = grid_1d::grid_stride(); + thread_index_type const tid = grid_1d::global_thread_id(); + thread_index_type const stride = grid_1d::grid_stride(); thread_index_type thread_word_index = tid + first_word_index; size_type thread_count{0}; @@ -432,7 +433,7 @@ std::pair bitmask_and(host_span begin_bits, size_type mask_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return bitmask_binop( [] __device__(bitmask_type left, bitmask_type right) { return left & right; }, @@ -446,7 +447,7 @@ std::pair bitmask_and(host_span bitmask_and(table_view const& view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); rmm::device_buffer null_mask{0, stream, mr}; @@ -479,7 +480,7 @@ std::pair bitmask_and(table_view const& view, // Returns the bitwise OR of the null masks of all columns in the table view std::pair bitmask_or(table_view const& view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); rmm::device_buffer null_mask{0, stream, mr}; @@ 
-512,7 +513,7 @@ std::pair bitmask_or(table_view const& view, void set_all_valid_null_masks(column_view const& input, column& output, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (input.nullable()) { auto mask = detail::create_null_mask(output.size(), mask_state::ALL_VALID, stream, mr); @@ -531,7 +532,7 @@ rmm::device_buffer copy_bitmask(bitmask_type const* mask, size_type begin_bit, size_type end_bit, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::copy_bitmask(mask, begin_bit, end_bit, stream, mr); @@ -540,7 +541,7 @@ rmm::device_buffer copy_bitmask(bitmask_type const* mask, // Create a bitmask from a column view rmm::device_buffer copy_bitmask(column_view const& view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::copy_bitmask(view, stream, mr); @@ -548,7 +549,7 @@ rmm::device_buffer copy_bitmask(column_view const& view, std::pair bitmask_and(table_view const& view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::bitmask_and(view, stream, mr); @@ -556,7 +557,7 @@ std::pair bitmask_and(table_view const& view, std::pair bitmask_or(table_view const& view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::bitmask_or(view, stream, mr); diff --git a/cpp/src/column/column.cu b/cpp/src/column/column.cu index d4a8fff69e2..90f719b9516 100644 --- a/cpp/src/column/column.cu +++ b/cpp/src/column/column.cu @@ -35,6 +35,7 @@ #include #include +#include #include @@ -46,9 +47,7 @@ namespace cudf { // Copy ctor w/ optional stream/mr -column::column(column const& other, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) 
+column::column(column const& other, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) : _type{other._type}, _size{other._size}, _data{other._data, stream, mr}, @@ -160,7 +159,7 @@ namespace { struct create_column_from_view { cudf::column_view view; rmm::cuda_stream_view stream{cudf::get_default_stream()}; - rmm::mr::device_memory_resource* mr; + rmm::device_async_resource_ref mr; template >* = nullptr> @@ -254,7 +253,7 @@ struct create_column_from_view { } // anonymous namespace // Copy from a view -column::column(column_view view, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) +column::column(column_view view, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) : // Move is needed here because the dereference operator of unique_ptr returns // an lvalue reference, which would otherwise dispatch to the copy constructor column{std::move(*type_dispatcher(view.type(), create_column_from_view{view, stream, mr}))} diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index d8da6a95aa4..e40056fc8a1 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,6 +25,8 @@ #include #include +#include + #include namespace cudf { @@ -75,7 +77,7 @@ std::unique_ptr make_numeric_column(data_type type, size_type size, mask_state state, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type."); @@ -95,7 +97,7 @@ std::unique_ptr make_fixed_point_column(data_type type, size_type size, mask_state state, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type."); @@ -115,7 +117,7 @@ std::unique_ptr make_timestamp_column(data_type type, size_type size, mask_state state, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type."); @@ -135,7 +137,7 @@ std::unique_ptr make_duration_column(data_type type, size_type size, mask_state state, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type."); @@ -155,7 +157,7 @@ std::unique_ptr make_fixed_width_column(data_type type, size_type size, mask_state state, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(is_fixed_width(type), "Invalid, non-fixed-width type."); @@ -171,7 +173,7 @@ std::unique_ptr make_fixed_width_column(data_type type, std::unique_ptr make_dictionary_from_scalar(scalar const& s, size_type size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (size == 0) return make_empty_column(type_id::DICTIONARY32); CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); diff --git 
a/cpp/src/column/column_factories.cu b/cpp/src/column/column_factories.cu index 0e65a131e67..bad20d6817c 100644 --- a/cpp/src/column/column_factories.cu +++ b/cpp/src/column/column_factories.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,8 @@ #include #include +#include + #include namespace cudf { @@ -33,7 +35,7 @@ struct column_from_scalar_dispatch { std::unique_ptr operator()(scalar const& value, size_type size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::device_async_resource_ref mr) const { if (size == 0) return make_empty_column(value.type()); if (!value.is_valid(stream)) @@ -51,7 +53,7 @@ std::unique_ptr column_from_scalar_dispatch::operator() column_from_scalar_dispatch::operator() std::unique_ptr column_from_scalar_dispatch::operator()( - scalar const&, size_type, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const + scalar const&, size_type, rmm::cuda_stream_view, rmm::device_async_resource_ref) const { CUDF_FAIL("dictionary not supported when creating from scalar"); } @@ -78,7 +80,7 @@ std::unique_ptr column_from_scalar_dispatch::operator()(&value); return lists::detail::make_lists_column_from_scalar(*lv, size, stream, mr); @@ -89,7 +91,7 @@ std::unique_ptr column_from_scalar_dispatch::operator() const&>(value); @@ -113,7 +115,7 @@ std::unique_ptr column_from_scalar_dispatch::operator() make_column_from_scalar(scalar const& s, size_type size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return type_dispatcher(s.type(), column_from_scalar_dispatch{}, s, size, stream, mr); } diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index b1d850e0b27..47e74a5cb48 100644 --- a/cpp/src/copying/concatenate.cu +++ 
b/cpp/src/copying/concatenate.cu @@ -30,9 +30,12 @@ #include #include #include +#include +#include #include #include +#include #include #include @@ -118,8 +121,8 @@ CUDF_KERNEL void concatenate_masks_kernel(column_device_view const* views, size_type number_of_mask_bits, size_type* out_valid_count) { - auto tidx = cudf::detail::grid_1d::global_thread_id(); - auto const stride = cudf::detail::grid_1d::grid_stride(); + auto tidx = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); auto active_mask = __ballot_sync(0xFFFF'FFFFu, tidx < number_of_mask_bits); size_type warp_valid_count = 0; @@ -241,7 +244,7 @@ template std::unique_ptr fused_concatenate(host_span views, bool const has_nulls, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { using mask_policy = cudf::mask_allocation_policy; @@ -288,7 +291,7 @@ template std::unique_ptr for_each_concatenate(host_span views, bool const has_nulls, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { size_type const total_element_count = std::accumulate(views.begin(), views.end(), 0, [](auto accumulator, auto const& v) { @@ -321,7 +324,7 @@ std::unique_ptr for_each_concatenate(host_span views, struct concatenate_dispatch { host_span views; rmm::cuda_stream_view stream; - rmm::mr::device_memory_resource* mr; + rmm::device_async_resource_ref mr; // fixed width template @@ -460,12 +463,9 @@ void traverse_children::operator()(host_span */ void bounds_and_type_check(host_span cols, rmm::cuda_stream_view stream) { - CUDF_EXPECTS(std::all_of(cols.begin(), - cols.end(), - [expected_type = cols.front().type()](auto const& c) { - return c.type() == expected_type; - }), - "Type mismatch in columns to concatenate."); + CUDF_EXPECTS(cudf::all_have_same_types(cols.begin(), cols.end()), + "Type mismatch in columns to concatenate.", + cudf::data_type_error); // total size of all 
concatenated rows size_t const total_row_count = @@ -485,7 +485,7 @@ void bounds_and_type_check(host_span cols, rmm::cuda_stream_v // Concatenates the elements from a vector of column_views std::unique_ptr concatenate(host_span columns_to_concat, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(not columns_to_concat.empty(), "Unexpected empty list of columns to concatenate."); @@ -504,7 +504,7 @@ std::unique_ptr concatenate(host_span columns_to_conc std::unique_ptr
concatenate(host_span tables_to_concat, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (tables_to_concat.empty()) { return std::make_unique
(); } @@ -533,7 +533,7 @@ std::unique_ptr
concatenate(host_span tables_to_concat, rmm::device_buffer concatenate_masks(host_span views, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { bool const has_nulls = std::any_of(views.begin(), views.end(), [](column_view const col) { return col.has_nulls(); }); @@ -558,7 +558,7 @@ rmm::device_buffer concatenate_masks(host_span views, rmm::device_buffer concatenate_masks(host_span views, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::concatenate_masks(views, stream, mr); @@ -567,7 +567,7 @@ rmm::device_buffer concatenate_masks(host_span views, // Concatenates the elements from a vector of column_views std::unique_ptr concatenate(host_span columns_to_concat, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::concatenate(columns_to_concat, stream, mr); @@ -575,7 +575,7 @@ std::unique_ptr concatenate(host_span columns_to_conc std::unique_ptr
concatenate(host_span tables_to_concat, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::concatenate(tables_to_concat, stream, mr); diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 23224d3225d..37db2c74790 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -31,6 +31,7 @@ #include #include +#include #include #include @@ -48,6 +49,7 @@ #include #include +#include #include namespace cudf { @@ -988,7 +990,7 @@ struct packed_split_indices_and_src_buf_info { std::size_t num_partitions, cudf::size_type num_src_bufs, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* temp_mr) + rmm::device_async_resource_ref temp_mr) : indices_size( cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align)), src_buf_info_size( @@ -1046,7 +1048,7 @@ struct packed_partition_buf_size_and_dst_buf_info { packed_partition_buf_size_and_dst_buf_info(std::size_t num_partitions, std::size_t num_bufs, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* temp_mr) + rmm::device_async_resource_ref temp_mr) : stream(stream), buf_sizes_size{cudf::util::round_up_safe(num_partitions * sizeof(std::size_t), split_align)}, dst_buf_info_size{cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align)}, @@ -1097,7 +1099,7 @@ struct packed_src_and_dst_pointers { std::size_t num_partitions, cudf::size_type num_src_bufs, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* temp_mr) + rmm::device_async_resource_ref temp_mr) : stream(stream), src_bufs_size{cudf::util::round_up_safe(num_src_bufs * sizeof(uint8_t*), split_align)}, dst_bufs_size{cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align)}, @@ -1139,7 +1141,7 @@ struct packed_src_and_dst_pointers { /** * @brief Create an instance of `packed_src_and_dst_pointers` populating destination - * 
partitition buffers (if any) from `out_buffers`. In the chunked_pack case + * partition buffers (if any) from `out_buffers`. In the chunked_pack case * `out_buffers` is empty, and the destination pointer is provided separately * to the `copy_partitions` kernel. * @@ -1158,7 +1160,7 @@ std::unique_ptr setup_src_and_dst_pointers( cudf::size_type num_src_bufs, std::vector& out_buffers, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* temp_mr) + rmm::device_async_resource_ref temp_mr) { auto src_and_dst_pointers = std::make_unique( input, num_partitions, num_src_bufs, stream, temp_mr); @@ -1195,7 +1197,7 @@ std::unique_ptr compute_splits( cudf::size_type num_src_bufs, std::size_t num_bufs, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* temp_mr) + rmm::device_async_resource_ref temp_mr) { auto partition_buf_size_and_dst_buf_info = std::make_unique( @@ -1366,7 +1368,7 @@ struct chunk_iteration_state { std::size_t num_partitions, std::size_t user_buffer_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* temp_mr); + rmm::device_async_resource_ref temp_mr); /** * @brief As of the time of the call, return the starting 1MB batch index, and the @@ -1426,7 +1428,7 @@ std::unique_ptr chunk_iteration_state::create( std::size_t num_partitions, std::size_t user_buffer_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* temp_mr) + rmm::device_async_resource_ref temp_mr) { rmm::device_uvector d_batch_offsets(num_bufs + 1, stream, temp_mr); @@ -1646,7 +1648,7 @@ std::unique_ptr compute_batches(int num_bufs, std::size_t num_partitions, std::size_t user_buffer_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* temp_mr) + rmm::device_async_resource_ref temp_mr) { // Since we parallelize at one block per copy, performance is vulnerable to situations where we // have small numbers of copies to do (a combination of small numbers of splits and/or columns), @@ -1769,8 +1771,8 @@ struct 
contiguous_split_state { contiguous_split_state(cudf::table_view const& input, std::size_t user_buffer_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr, - rmm::mr::device_memory_resource* temp_mr) + std::optional mr, + rmm::device_async_resource_ref temp_mr) : contiguous_split_state(input, {}, user_buffer_size, stream, mr, temp_mr) { } @@ -1778,8 +1780,8 @@ struct contiguous_split_state { contiguous_split_state(cudf::table_view const& input, std::vector const& splits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr, - rmm::mr::device_memory_resource* temp_mr) + std::optional mr, + rmm::device_async_resource_ref temp_mr) : contiguous_split_state(input, splits, 0, stream, mr, temp_mr) { } @@ -1897,8 +1899,8 @@ struct contiguous_split_state { std::vector const& splits, std::size_t user_buffer_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr, - rmm::mr::device_memory_resource* temp_mr) + std::optional mr, + rmm::device_async_resource_ref temp_mr) : input(input), user_buffer_size(user_buffer_size), stream(stream), @@ -1936,7 +1938,8 @@ struct contiguous_split_state { std::transform(h_buf_sizes, h_buf_sizes + num_partitions, std::back_inserter(out_buffers), - [stream = stream, mr = mr](std::size_t bytes) { + [stream = stream, + mr = mr.value_or(rmm::mr::get_current_device_resource())](std::size_t bytes) { return rmm::device_buffer{bytes, stream, mr}; }); } @@ -2014,11 +2017,11 @@ struct contiguous_split_state { cudf::table_view const input; ///< The input table_view to operate on std::size_t const user_buffer_size; ///< The size of the user buffer for the chunked_pack case rmm::cuda_stream_view const stream; - rmm::mr::device_memory_resource* const mr; ///< The memory resource for any data returned + std::optional mr; ///< The resource for any data returned // this resource defaults to `mr` for the contiguous_split case, but it can be useful for the // `chunked_pack` case to allocate scratch/temp 
memory in a pool - rmm::mr::device_memory_resource* const temp_mr; ///< The memory resource for scratch/temp space + rmm::device_async_resource_ref const temp_mr; ///< The memory resource for scratch/temp space // whether the table was empty to begin with (0 rows or 0 columns) and should be metadata-only bool const is_empty; ///< True if the source table has 0 rows or 0 columns @@ -2062,7 +2065,7 @@ struct contiguous_split_state { std::vector contiguous_split(cudf::table_view const& input, std::vector const& splits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // `temp_mr` is the same as `mr` for contiguous_split as it allocates all // of its memory from the default memory resource in cuDF @@ -2075,7 +2078,7 @@ std::vector contiguous_split(cudf::table_view const& input, std::vector contiguous_split(cudf::table_view const& input, std::vector const& splits, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::contiguous_split(input, splits, cudf::get_default_stream(), mr); @@ -2083,14 +2086,14 @@ std::vector contiguous_split(cudf::table_view const& input, chunked_pack::chunked_pack(cudf::table_view const& input, std::size_t user_buffer_size, - rmm::mr::device_memory_resource* temp_mr) + rmm::device_async_resource_ref temp_mr) { CUDF_EXPECTS(user_buffer_size >= desired_batch_size, "The output buffer size must be at least 1MB in size"); - // We pass `nullptr` for the first `mr` in `contiguous_split_state` to indicate + // We pass `std::nullopt` for the first `mr` in `contiguous_split_state` to indicate // that it does not allocate any user-bound data for the `chunked_pack` case. 
state = std::make_unique( - input, user_buffer_size, cudf::get_default_stream(), nullptr, temp_mr); + input, user_buffer_size, cudf::get_default_stream(), std::nullopt, temp_mr); } // required for the unique_ptr to work with a incomplete type (contiguous_split_state) @@ -2115,7 +2118,7 @@ std::unique_ptr> chunked_pack::build_metadata() const std::unique_ptr chunked_pack::create(cudf::table_view const& input, std::size_t user_buffer_size, - rmm::mr::device_memory_resource* temp_mr) + rmm::device_async_resource_ref temp_mr) { return std::make_unique(input, user_buffer_size, temp_mr); } diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp index 490a1ccb254..98ee6aa8f68 100644 --- a/cpp/src/copying/copy.cpp +++ b/cpp/src/copying/copy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,6 +26,7 @@ #include #include +#include #include @@ -119,10 +120,11 @@ std::unique_ptr allocate_like(column_view const& input, size_type size, mask_allocation_policy mask_alloc, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_fixed_width(input.type()), "Expects only fixed-width type column"); + CUDF_EXPECTS( + is_fixed_width(input.type()), "Expects only fixed-width type column", cudf::data_type_error); mask_state allocate_mask = should_allocate_mask(mask_alloc, input.nullable()); return std::make_unique(input.type(), @@ -176,7 +178,7 @@ std::unique_ptr
empty_like(table_view const& input_table) std::unique_ptr allocate_like(column_view const& input, mask_allocation_policy mask_alloc, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::allocate_like(input, input.size(), mask_alloc, stream, mr); @@ -186,7 +188,7 @@ std::unique_ptr allocate_like(column_view const& input, size_type size, mask_allocation_policy mask_alloc, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::allocate_like(input, size, mask_alloc, stream, mr); diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu index 8299c211fad..e86a1f8d6f1 100644 --- a/cpp/src/copying/copy.cu +++ b/cpp/src/copying/copy.cu @@ -26,10 +26,12 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -77,7 +79,7 @@ struct copy_if_else_functor_impl bool right_nullable, Filter filter, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto p_lhs = get_iterable_device_view{}(lhs_h, stream); auto p_rhs = get_iterable_device_view{}(rhs_h, stream); @@ -110,7 +112,7 @@ struct copy_if_else_functor_impl { bool right_nullable, Filter filter, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { using T = string_view; @@ -162,7 +164,7 @@ std::unique_ptr scatter_gather_based_if_else(cudf::column_view const& lh size_type size, Filter is_left, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto gather_map = rmm::device_uvector{static_cast(size), stream}; auto const gather_map_end = thrust::copy_if(rmm::exec_policy(stream), @@ -196,7 +198,7 @@ std::unique_ptr scatter_gather_based_if_else(cudf::scalar const& lhs, size_type size, Filter is_left, rmm::cuda_stream_view stream, - 
rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto scatter_map = rmm::device_uvector{static_cast(size), stream}; auto const scatter_map_end = thrust::copy_if(rmm::exec_policy(stream), @@ -225,7 +227,7 @@ std::unique_ptr scatter_gather_based_if_else(cudf::column_view const& lh size_type size, Filter is_left, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return scatter_gather_based_if_else(rhs, lhs, size, logical_not{is_left}, stream, mr); } @@ -236,7 +238,7 @@ std::unique_ptr scatter_gather_based_if_else(cudf::scalar const& lhs, size_type size, Filter is_left, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto rhs_col = cudf::make_column_from_scalar(rhs, size, stream, mr); return scatter_gather_based_if_else(lhs, rhs_col->view(), size, is_left, stream, mr); @@ -252,7 +254,7 @@ struct copy_if_else_functor_impl { bool, Filter filter, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return scatter_gather_based_if_else(lhs, rhs, size, filter, stream, mr); } @@ -268,7 +270,7 @@ struct copy_if_else_functor_impl { bool, Filter filter, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return scatter_gather_based_if_else(lhs, rhs, size, filter, stream, mr); } @@ -284,7 +286,7 @@ struct copy_if_else_functor_impl { bool, Filter filter, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return scatter_gather_based_if_else(lhs, rhs, size, filter, stream, mr); } @@ -303,7 +305,7 @@ struct copy_if_else_functor { bool right_nullable, Filter filter, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { copy_if_else_functor_impl copier{}; return copier(lhs, rhs, size, left_nullable, right_nullable, filter, 
stream, mr); @@ -318,7 +320,7 @@ std::unique_ptr copy_if_else(Left const& lhs, bool right_nullable, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(boolean_mask.type() == data_type(type_id::BOOL8), "Boolean mask column must be of type type_id::BOOL8", @@ -356,14 +358,15 @@ std::unique_ptr copy_if_else(column_view const& lhs, column_view const& rhs, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(boolean_mask.size() == lhs.size(), "Boolean mask column must be the same size as lhs and rhs columns", std::invalid_argument); - CUDF_EXPECTS(lhs.size() == rhs.size(), "Both columns must be of the size", std::invalid_argument); CUDF_EXPECTS( - lhs.type() == rhs.type(), "Both inputs must be of the same type", cudf::data_type_error); + lhs.size() == rhs.size(), "Both columns must be of the same size", std::invalid_argument); + CUDF_EXPECTS( + cudf::have_same_types(lhs, rhs), "Both inputs must be of the same type", cudf::data_type_error); return copy_if_else(lhs, rhs, lhs.has_nulls(), rhs.has_nulls(), boolean_mask, stream, mr); } @@ -372,16 +375,13 @@ std::unique_ptr copy_if_else(scalar const& lhs, column_view const& rhs, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(boolean_mask.size() == rhs.size(), "Boolean mask column must be the same size as rhs column", std::invalid_argument); - - auto rhs_type = - cudf::is_dictionary(rhs.type()) ? 
cudf::dictionary_column_view(rhs).keys_type() : rhs.type(); CUDF_EXPECTS( - lhs.type() == rhs_type, "Both inputs must be of the same type", cudf::data_type_error); + cudf::have_same_types(rhs, lhs), "Both inputs must be of the same type", cudf::data_type_error); return copy_if_else(lhs, rhs, !lhs.is_valid(stream), rhs.has_nulls(), boolean_mask, stream, mr); } @@ -390,16 +390,13 @@ std::unique_ptr copy_if_else(column_view const& lhs, scalar const& rhs, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(boolean_mask.size() == lhs.size(), "Boolean mask column must be the same size as lhs column", std::invalid_argument); - - auto lhs_type = - cudf::is_dictionary(lhs.type()) ? cudf::dictionary_column_view(lhs).keys_type() : lhs.type(); CUDF_EXPECTS( - lhs_type == rhs.type(), "Both inputs must be of the same type", cudf::data_type_error); + cudf::have_same_types(lhs, rhs), "Both inputs must be of the same type", cudf::data_type_error); return copy_if_else(lhs, rhs, lhs.has_nulls(), !rhs.is_valid(stream), boolean_mask, stream, mr); } @@ -408,10 +405,10 @@ std::unique_ptr copy_if_else(scalar const& lhs, scalar const& rhs, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS( - lhs.type() == rhs.type(), "Both inputs must be of the same type", cudf::data_type_error); + cudf::have_same_types(lhs, rhs), "Both inputs must be of the same type", cudf::data_type_error); return copy_if_else( lhs, rhs, !lhs.is_valid(stream), !rhs.is_valid(stream), boolean_mask, stream, mr); } @@ -422,7 +419,7 @@ std::unique_ptr copy_if_else(column_view const& lhs, column_view const& rhs, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::copy_if_else(lhs, rhs, boolean_mask, 
stream, mr); @@ -432,7 +429,7 @@ std::unique_ptr copy_if_else(scalar const& lhs, column_view const& rhs, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::copy_if_else(lhs, rhs, boolean_mask, stream, mr); @@ -442,7 +439,7 @@ std::unique_ptr copy_if_else(column_view const& lhs, scalar const& rhs, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::copy_if_else(lhs, rhs, boolean_mask, stream, mr); @@ -452,7 +449,7 @@ std::unique_ptr copy_if_else(scalar const& lhs, scalar const& rhs, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::copy_if_else(lhs, rhs, boolean_mask, stream, mr); diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu index 038646d8cf4..dd18f99a3c8 100644 --- a/cpp/src/copying/copy_range.cu +++ b/cpp/src/copying/copy_range.cu @@ -32,8 +32,10 @@ #include #include #include +#include #include +#include #include @@ -98,7 +100,7 @@ struct out_of_place_copy_range_dispatch { cudf::size_type source_end, cudf::size_type target_begin, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) { auto p_ret = std::make_unique(target, stream, mr); if ((!p_ret->nullable()) && source.has_nulls(source_begin, source_end)) { @@ -119,7 +121,7 @@ struct out_of_place_copy_range_dispatch { std::enable_if_t(), std::unique_ptr> operator()(Args...) 
{ - CUDF_FAIL("Unsupported type for out of place copy."); + CUDF_FAIL("Unsupported type for out of place copy.", cudf::data_type_error); } }; @@ -129,7 +131,7 @@ std::unique_ptr out_of_place_copy_range_dispatch::operator() out_of_place_copy_range_dispatch::operator() copy_range(column_view const& source, size_type source_end, size_type target_begin, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS((source_begin >= 0) && (source_end <= source.size()) && (source_begin <= source_end) && (target_begin >= 0) && (target_begin <= target.size() - (source_end - source_begin)), "Range is out of bounds.", std::out_of_range); - CUDF_EXPECTS(target.type() == source.type(), "Data type mismatch.", cudf::data_type_error); + CUDF_EXPECTS(cudf::have_same_types(target, source), "Data type mismatch.", cudf::data_type_error); return cudf::type_dispatcher( target.type(), @@ -270,7 +273,7 @@ std::unique_ptr copy_range(column_view const& source, size_type source_end, size_type target_begin, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::copy_range(source, target, source_begin, source_end, target_begin, stream, mr); diff --git a/cpp/src/copying/gather.cu b/cpp/src/copying/gather.cu index 78748e5a00b..5eb039419df 100644 --- a/cpp/src/copying/gather.cu +++ b/cpp/src/copying/gather.cu @@ -25,6 +25,7 @@ #include #include +#include #include #include @@ -39,7 +40,7 @@ std::unique_ptr
gather(table_view const& source_table, out_of_bounds_policy bounds_policy, negative_index_policy neg_indices, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(not gather_map.has_nulls(), "gather_map contains nulls", std::invalid_argument); @@ -66,7 +67,7 @@ std::unique_ptr
gather(table_view const& source_table, out_of_bounds_policy bounds_policy, negative_index_policy neg_indices, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(gather_map.size() <= static_cast(std::numeric_limits::max()), "gather map size exceeds the column size limit", @@ -85,7 +86,7 @@ std::unique_ptr
gather(table_view const& source_table, column_view const& gather_map, out_of_bounds_policy bounds_policy, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu index 2e804415439..b8860da479c 100644 --- a/cpp/src/copying/get_element.cu +++ b/cpp/src/copying/get_element.cu @@ -29,6 +29,7 @@ #include #include +#include #include @@ -42,7 +43,7 @@ struct get_element_functor { std::unique_ptr operator()(column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto s = make_fixed_width_scalar(data_type(type_to_id()), stream, mr); @@ -65,7 +66,7 @@ struct get_element_functor { std::unique_ptr operator()(column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto device_col = column_device_view::create(input, stream); @@ -89,7 +90,7 @@ struct get_element_functor { std::unique_ptr operator()(column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto dict_view = dictionary_column_view(input); auto indices_iter = detail::indexalator_factory::make_input_iterator(dict_view.indices()); @@ -124,7 +125,7 @@ struct get_element_functor { std::unique_ptr operator()(column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { bool valid = is_element_valid_sync(input, index, stream); auto const child_col_idx = lists_column_view::child_column_index; @@ -148,7 +149,7 @@ struct get_element_functor { std::unique_ptr operator()(column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) 
{ using Type = typename T::rep; @@ -178,7 +179,7 @@ struct get_element_functor { std::unique_ptr operator()(column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { bool valid = is_element_valid_sync(input, index, stream); auto row_contents = @@ -193,7 +194,7 @@ struct get_element_functor { std::unique_ptr get_element(column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(index >= 0 and index < input.size(), "Index out of bounds", std::out_of_range); return type_dispatcher(input.type(), get_element_functor{}, input, index, stream, mr); @@ -204,7 +205,7 @@ std::unique_ptr get_element(column_view const& input, std::unique_ptr get_element(column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::get_element(input, index, stream, mr); diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp index e4de4a43b68..b0208a58896 100644 --- a/cpp/src/copying/pack.cpp +++ b/cpp/src/copying/pack.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -144,7 +145,7 @@ void build_column_metadata(metadata_builder& mb, */ packed_columns pack(cudf::table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // do a contiguous_split with no splits to get the memory for the table // arranged as we want it @@ -260,7 +261,7 @@ void metadata_builder::clear() { return impl->clear(); } /** * @copydoc cudf::pack */ -packed_columns pack(cudf::table_view const& input, rmm::mr::device_memory_resource* mr) +packed_columns pack(cudf::table_view const& input, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::pack(input, cudf::get_default_stream(), mr); diff --git a/cpp/src/copying/purge_nonempty_nulls.cu b/cpp/src/copying/purge_nonempty_nulls.cu index 620a03d8be5..d69d214a881 100644 --- a/cpp/src/copying/purge_nonempty_nulls.cu +++ b/cpp/src/copying/purge_nonempty_nulls.cu @@ -18,6 +18,8 @@ #include #include +#include + #include #include @@ -87,7 +89,7 @@ bool has_nonempty_nulls(cudf::column_view const& input, rmm::cuda_stream_view st std::unique_ptr purge_nonempty_nulls(column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // If not compound types (LIST/STRING/STRUCT/DICTIONARY) then just copy the input into output. 
if (!cudf::is_compound(input.type())) { return std::make_unique(input, stream, mr); } @@ -132,11 +134,11 @@ bool has_nonempty_nulls(column_view const& input, rmm::cuda_stream_view stream) } /** - * @copydoc cudf::purge_nonempty_nulls(column_view const&, rmm::mr::device_memory_resource*) + * @copydoc cudf::purge_nonempty_nulls(column_view const&, rmm::device_async_resource_ref) */ std::unique_ptr purge_nonempty_nulls(column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return detail::purge_nonempty_nulls(input, stream, mr); } diff --git a/cpp/src/copying/reverse.cu b/cpp/src/copying/reverse.cu index 78d1b54882c..d3d42e35e26 100644 --- a/cpp/src/copying/reverse.cu +++ b/cpp/src/copying/reverse.cu @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -37,7 +38,7 @@ namespace cudf { namespace detail { std::unique_ptr
reverse(table_view const& source_table, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { size_type num_rows = source_table.num_rows(); auto elements = make_counting_transform_iterator( @@ -51,7 +52,7 @@ std::unique_ptr
reverse(table_view const& source_table, std::unique_ptr reverse(column_view const& source_column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return std::move( cudf::detail::reverse(table_view({source_column}), stream, mr)->release().front()); @@ -60,7 +61,7 @@ std::unique_ptr reverse(column_view const& source_column, std::unique_ptr
reverse(table_view const& source_table, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::reverse(source_table, stream, mr); @@ -68,7 +69,7 @@ std::unique_ptr
reverse(table_view const& source_table, std::unique_ptr reverse(column_view const& source_column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::reverse(source_column, stream, mr); diff --git a/cpp/src/copying/sample.cu b/cpp/src/copying/sample.cu index 0211f97deb3..f8e3a9a83e3 100644 --- a/cpp/src/copying/sample.cu +++ b/cpp/src/copying/sample.cu @@ -25,6 +25,7 @@ #include #include +#include #include #include @@ -40,7 +41,7 @@ std::unique_ptr
sample(table_view const& input, sample_with_replacement replacement, int64_t const seed, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(n >= 0, "expected number of samples should be non-negative"); auto const num_rows = input.num_rows(); @@ -92,7 +93,7 @@ std::unique_ptr
sample(table_view const& input, sample_with_replacement replacement, int64_t const seed, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::sample(input, n, replacement, seed, stream, mr); diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 7931df4c9f0..993ee074f14 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -32,8 +32,11 @@ #include #include #include +#include +#include #include +#include #include #include @@ -77,7 +80,7 @@ void scatter_scalar_bitmask_inplace(std::reference_wrapper const& size_type num_scatter_rows, column& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { constexpr size_type block_size = 256; size_type const grid_size = grid_1d(num_scatter_rows, block_size).num_blocks; @@ -109,9 +112,9 @@ struct column_scalar_scatterer_impl { size_type scatter_rows, column_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::device_async_resource_ref mr) const { - CUDF_EXPECTS(source.get().type() == target.type(), + CUDF_EXPECTS(cudf::have_same_types(target, source.get()), "scalar and column types must match", cudf::data_type_error); @@ -142,9 +145,11 @@ struct column_scalar_scatterer_impl { size_type scatter_rows, column_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::device_async_resource_ref mr) const { - CUDF_EXPECTS(source.get().type() == target.type(), "scalar and column types must match"); + CUDF_EXPECTS(cudf::have_same_types(target, source.get()), + "scalar and column types must match", + cudf::data_type_error); auto const scalar_impl = static_cast(&source.get()); auto const source_view = string_view(scalar_impl->data(), scalar_impl->size()); @@ -164,8 +169,11 @@ struct column_scalar_scatterer_impl { size_type scatter_rows, column_view const& 
target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::device_async_resource_ref mr) const { + CUDF_EXPECTS(source.get().type() == target.type(), + "scalar and column types must match", + cudf::data_type_error); auto result = lists::detail::scatter(source, scatter_iter, scatter_iter + scatter_rows, target, stream, mr); @@ -181,7 +189,7 @@ struct column_scalar_scatterer_impl { size_type scatter_rows, column_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::device_async_resource_ref mr) const { auto dict_target = dictionary::detail::add_keys(dictionary_column_view(target), @@ -233,7 +241,7 @@ struct column_scalar_scatterer { size_type scatter_rows, column_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::device_async_resource_ref mr) const { column_scalar_scatterer_impl scatterer{}; return scatterer(source, scatter_iter, scatter_rows, target, stream, mr); @@ -247,8 +255,12 @@ struct column_scalar_scatterer_impl { size_type scatter_rows, column_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::device_async_resource_ref mr) const { + CUDF_EXPECTS(source.get().type() == target.type(), + "scalar and column types must match", + cudf::data_type_error); + // For each field of `source`, copy construct a scalar from the field // and dispatch to the corresponding scalar scatterer @@ -297,7 +309,7 @@ std::unique_ptr
scatter(table_view const& source, column_view const& scatter_map, table_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(source.num_columns() == target.num_columns(), "Number of columns in source and target not equal", @@ -305,12 +317,7 @@ std::unique_ptr
scatter(table_view const& source, CUDF_EXPECTS(scatter_map.size() <= source.num_rows(), "Size of scatter map must be equal to or less than source rows", std::invalid_argument); - CUDF_EXPECTS(std::equal(source.begin(), - source.end(), - target.begin(), - [](auto const& col1, auto const& col2) { - return col1.type().id() == col2.type().id(); - }), + CUDF_EXPECTS(cudf::have_same_types(source, target), "Column types do not match between source and target", cudf::data_type_error); CUDF_EXPECTS(not scatter_map.has_nulls(), "Scatter map contains nulls", std::invalid_argument); @@ -327,7 +334,7 @@ std::unique_ptr
scatter(table_view const& source, device_span const scatter_map, table_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(scatter_map.size() <= static_cast(std::numeric_limits::max()), "scatter map size exceeds the column size limit", @@ -344,7 +351,7 @@ std::unique_ptr
scatter(std::vector> column_view const& indices, table_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(source.size() == static_cast(target.num_columns()), "Number of scalars in source and number of columns in target not equal", @@ -396,7 +403,7 @@ std::unique_ptr boolean_mask_scatter(column_view const& input, column_view const& target, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto indices = cudf::make_numeric_column( data_type{type_id::INT32}, target.size(), mask_state::UNALLOCATED, stream); @@ -421,7 +428,7 @@ std::unique_ptr boolean_mask_scatter(scalar const& input, column_view const& target, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return detail::copy_if_else(input, target, boolean_mask, stream, mr); } @@ -430,7 +437,7 @@ std::unique_ptr
boolean_mask_scatter(table_view const& input, table_view const& target, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(input.num_columns() == target.num_columns(), "Mismatch in number of input columns and target columns", @@ -442,14 +449,9 @@ std::unique_ptr
boolean_mask_scatter(table_view const& input, "Mask must be of Boolean type", cudf::data_type_error); // Count valid pair of input and columns as per type at each column index i - CUDF_EXPECTS( - std::all_of(thrust::counting_iterator(0), - thrust::counting_iterator(target.num_columns()), - [&input, &target](auto index) { - return ((input.column(index).type().id()) == (target.column(index).type().id())); - }), - "Type mismatch in input column and target column", - cudf::data_type_error); + CUDF_EXPECTS(cudf::have_same_types(input, target), + "Type mismatch in input column and target column", + cudf::data_type_error); if (target.num_rows() != 0) { std::vector> out_columns(target.num_columns()); @@ -473,7 +475,7 @@ std::unique_ptr
boolean_mask_scatter( table_view const& target, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(static_cast(input.size()) == target.num_columns(), "Mismatch in number of scalars and target columns", @@ -486,14 +488,13 @@ std::unique_ptr
boolean_mask_scatter( cudf::data_type_error); // Count valid pair of input and columns as per type at each column/scalar index i - CUDF_EXPECTS( - std::all_of(thrust::counting_iterator(0), - thrust::counting_iterator(target.num_columns()), - [&input, &target](auto index) { - return (input[index].get().type().id() == target.column(index).type().id()); - }), - "Type mismatch in input scalar and target column", - cudf::data_type_error); + CUDF_EXPECTS(std::all_of(thrust::counting_iterator(0), + thrust::counting_iterator(target.num_columns()), + [&input, &target](auto index) { + return cudf::have_same_types(target.column(index), input[index].get()); + }), + "Type mismatch in input scalar and target column", + cudf::data_type_error); if (target.num_rows() != 0) { std::vector> out_columns(target.num_columns()); @@ -518,7 +519,7 @@ std::unique_ptr
scatter(table_view const& source, column_view const& scatter_map, table_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::scatter(source, scatter_map, target, stream, mr); @@ -528,7 +529,7 @@ std::unique_ptr
scatter(std::vector> column_view const& indices, table_view const& target, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::scatter(source, indices, target, stream, mr); @@ -538,7 +539,7 @@ std::unique_ptr
boolean_mask_scatter(table_view const& input, table_view const& target, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::boolean_mask_scatter(input, target, boolean_mask, stream, mr); @@ -549,7 +550,7 @@ std::unique_ptr
boolean_mask_scatter( table_view const& target, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::boolean_mask_scatter(input, target, boolean_mask, stream, mr); diff --git a/cpp/src/copying/segmented_shift.cu b/cpp/src/copying/segmented_shift.cu index dd2733cf7e9..b7abc60f240 100644 --- a/cpp/src/copying/segmented_shift.cu +++ b/cpp/src/copying/segmented_shift.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,6 +24,7 @@ #include #include +#include #include #include @@ -73,7 +74,7 @@ struct segmented_shift_functor() size_type offset, scalar const& fill_value, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto values_device_view = column_device_view::create(segmented_values, stream); bool nullable = not fill_value.is_valid(stream) or segmented_values.nullable(); @@ -102,7 +103,7 @@ struct segmented_shift_functor { size_type offset, scalar const& fill_value, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto values_device_view = column_device_view::create(segmented_values, stream); auto input_iterator = make_optional_iterator( @@ -129,7 +130,7 @@ struct segmented_shift_functor_forwarder { size_type offset, scalar const& fill_value, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { segmented_shift_functor shifter; return shifter(segmented_values, segment_offsets, offset, fill_value, stream, mr); @@ -143,7 +144,7 @@ std::unique_ptr segmented_shift(column_view const& segmented_values, size_type offset, scalar const& fill_value, rmm::cuda_stream_view stream, - 
rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (segmented_values.is_empty()) { return empty_like(segmented_values); } if (offset == 0) { return std::make_unique(segmented_values, stream, mr); }; diff --git a/cpp/src/copying/shift.cu b/cpp/src/copying/shift.cu index 8e013bb1212..91254f21170 100644 --- a/cpp/src/copying/shift.cu +++ b/cpp/src/copying/shift.cu @@ -26,10 +26,12 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -51,7 +53,7 @@ std::pair create_null_mask(column_device_view con size_type offset, scalar const& fill_value, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const size = input.size(); auto func_validity = @@ -81,7 +83,7 @@ struct shift_functor { size_type offset, scalar const& fill_value, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto output = cudf::strings::detail::shift( cudf::strings_column_view(input), offset, fill_value, stream, mr); @@ -101,7 +103,7 @@ struct shift_functor { size_type offset, scalar const& fill_value, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { using ScalarType = cudf::scalar_type_t; auto& scalar = static_cast(fill_value); @@ -155,9 +157,9 @@ std::unique_ptr shift(column_view const& input, size_type offset, scalar const& fill_value, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(input.type() == fill_value.type(), + CUDF_EXPECTS(cudf::have_same_types(input, fill_value), "shift requires each fill value type to match the corresponding column type.", cudf::data_type_error); @@ -173,7 +175,7 @@ std::unique_ptr shift(column_view const& input, size_type offset, scalar const& fill_value, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref 
mr) { CUDF_FUNC_RANGE(); return detail::shift(input, offset, fill_value, stream, mr); diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 371663c41ee..7629cad79a9 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,6 +35,7 @@ #include #include +#include #include #include @@ -254,7 +255,7 @@ struct dispatch_round { rounding_frequency component, cudf::column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::device_async_resource_ref mr) const { auto size = column.size(); auto output_col_type = data_type{cudf::type_to_id()}; @@ -319,7 +320,7 @@ struct launch_functor { template std::unique_ptr apply_datetime_op(column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(is_timestamp(column.type()), "Column type should be timestamp"); auto size = column.size(); @@ -355,7 +356,7 @@ struct add_calendrical_months_functor { column_view timestamp_column, MonthIterator months_begin, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::device_async_resource_ref mr) const { auto size = timestamp_column.size(); auto output_col_type = timestamp_column.type(); @@ -386,7 +387,7 @@ struct add_calendrical_months_functor { std::unique_ptr add_calendrical_months(column_view const& timestamp_column, column_view const& months_column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(is_timestamp(timestamp_column.type()), "Column type should be timestamp"); CUDF_EXPECTS( @@ -413,7 +414,7 @@ std::unique_ptr 
add_calendrical_months(column_view const& timestamp_colu std::unique_ptr add_calendrical_months(column_view const& timestamp_column, scalar const& months, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(is_timestamp(timestamp_column.type()), "Column type should be timestamp"); CUDF_EXPECTS(months.type().id() == type_id::INT16 or months.type().id() == type_id::INT32, @@ -442,7 +443,7 @@ std::unique_ptr round_general(rounding_function round_kind, rounding_frequency component, column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return cudf::type_dispatcher( column.type(), dispatch_round{}, round_kind, component, column, stream, mr); @@ -450,7 +451,7 @@ std::unique_ptr round_general(rounding_function round_kind, std::unique_ptr extract_year(column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return detail::apply_datetime_op< detail::extract_component_operator, @@ -459,7 +460,7 @@ std::unique_ptr extract_year(column_view const& column, std::unique_ptr extract_month(column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return detail::apply_datetime_op< detail::extract_component_operator, @@ -468,7 +469,7 @@ std::unique_ptr extract_month(column_view const& column, std::unique_ptr extract_day(column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return detail::apply_datetime_op< detail::extract_component_operator, @@ -477,7 +478,7 @@ std::unique_ptr extract_day(column_view const& column, std::unique_ptr extract_weekday(column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return 
detail::apply_datetime_op< detail::extract_component_operator, @@ -486,7 +487,7 @@ std::unique_ptr extract_weekday(column_view const& column, std::unique_ptr extract_hour(column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return detail::apply_datetime_op< detail::extract_component_operator, @@ -495,7 +496,7 @@ std::unique_ptr extract_hour(column_view const& column, std::unique_ptr extract_minute(column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return detail::apply_datetime_op< detail::extract_component_operator, @@ -504,7 +505,7 @@ std::unique_ptr extract_minute(column_view const& column, std::unique_ptr extract_second(column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return detail::apply_datetime_op< detail::extract_component_operator, @@ -513,7 +514,7 @@ std::unique_ptr extract_second(column_view const& column, std::unique_ptr extract_millisecond_fraction(column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return detail::apply_datetime_op< detail::extract_component_operator, @@ -522,7 +523,7 @@ std::unique_ptr extract_millisecond_fraction(column_view const& column, std::unique_ptr extract_microsecond_fraction(column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return detail::apply_datetime_op< detail::extract_component_operator, @@ -531,7 +532,7 @@ std::unique_ptr extract_microsecond_fraction(column_view const& column, std::unique_ptr extract_nanosecond_fraction(column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return detail::apply_datetime_op< 
detail::extract_component_operator, @@ -540,7 +541,7 @@ std::unique_ptr extract_nanosecond_fraction(column_view const& column, std::unique_ptr last_day_of_month(column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return detail::apply_datetime_op(column, stream, mr); @@ -548,7 +549,7 @@ std::unique_ptr last_day_of_month(column_view const& column, std::unique_ptr day_of_year(column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return detail::apply_datetime_op( column, stream, mr); @@ -556,21 +557,21 @@ std::unique_ptr day_of_year(column_view const& column, std::unique_ptr is_leap_year(column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return apply_datetime_op(column, stream, mr); } std::unique_ptr days_in_month(column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return apply_datetime_op(column, stream, mr); } std::unique_ptr extract_quarter(column_view const& column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return apply_datetime_op(column, stream, mr); } @@ -579,7 +580,7 @@ std::unique_ptr extract_quarter(column_view const& column, std::unique_ptr ceil_datetimes(column_view const& column, rounding_frequency freq, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::round_general( @@ -588,7 +589,7 @@ std::unique_ptr ceil_datetimes(column_view const& column, std::unique_ptr floor_datetimes(column_view const& column, rounding_frequency freq, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::round_general( @@ -597,88 +598,85 @@ std::unique_ptr 
floor_datetimes(column_view const& column, std::unique_ptr round_datetimes(column_view const& column, rounding_frequency freq, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::round_general( detail::rounding_function::ROUND, freq, column, cudf::get_default_stream(), mr); } -std::unique_ptr extract_year(column_view const& column, rmm::mr::device_memory_resource* mr) +std::unique_ptr extract_year(column_view const& column, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::extract_year(column, cudf::get_default_stream(), mr); } -std::unique_ptr extract_month(column_view const& column, - rmm::mr::device_memory_resource* mr) +std::unique_ptr extract_month(column_view const& column, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::extract_month(column, cudf::get_default_stream(), mr); } -std::unique_ptr extract_day(column_view const& column, rmm::mr::device_memory_resource* mr) +std::unique_ptr extract_day(column_view const& column, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::extract_day(column, cudf::get_default_stream(), mr); } std::unique_ptr extract_weekday(column_view const& column, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::extract_weekday(column, cudf::get_default_stream(), mr); } -std::unique_ptr extract_hour(column_view const& column, rmm::mr::device_memory_resource* mr) +std::unique_ptr extract_hour(column_view const& column, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::extract_hour(column, cudf::get_default_stream(), mr); } -std::unique_ptr extract_minute(column_view const& column, - rmm::mr::device_memory_resource* mr) +std::unique_ptr extract_minute(column_view const& column, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::extract_minute(column, cudf::get_default_stream(), mr); } -std::unique_ptr 
extract_second(column_view const& column, - rmm::mr::device_memory_resource* mr) +std::unique_ptr extract_second(column_view const& column, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::extract_second(column, cudf::get_default_stream(), mr); } std::unique_ptr extract_millisecond_fraction(column_view const& column, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::extract_millisecond_fraction(column, cudf::get_default_stream(), mr); } std::unique_ptr extract_microsecond_fraction(column_view const& column, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::extract_microsecond_fraction(column, cudf::get_default_stream(), mr); } std::unique_ptr extract_nanosecond_fraction(column_view const& column, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::extract_nanosecond_fraction(column, cudf::get_default_stream(), mr); } std::unique_ptr last_day_of_month(column_view const& column, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::last_day_of_month(column, cudf::get_default_stream(), mr); } -std::unique_ptr day_of_year(column_view const& column, rmm::mr::device_memory_resource* mr) +std::unique_ptr day_of_year(column_view const& column, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::day_of_year(column, cudf::get_default_stream(), mr); @@ -686,7 +684,7 @@ std::unique_ptr day_of_year(column_view const& column, rmm::mr::device_m std::unique_ptr add_calendrical_months(cudf::column_view const& timestamp_column, cudf::column_view const& months_column, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::add_calendrical_months( @@ -695,27 +693,26 @@ std::unique_ptr add_calendrical_months(cudf::column_view const& ti std::unique_ptr 
add_calendrical_months(cudf::column_view const& timestamp_column, cudf::scalar const& months, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::add_calendrical_months(timestamp_column, months, cudf::get_default_stream(), mr); } -std::unique_ptr is_leap_year(column_view const& column, rmm::mr::device_memory_resource* mr) +std::unique_ptr is_leap_year(column_view const& column, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::is_leap_year(column, cudf::get_default_stream(), mr); } -std::unique_ptr days_in_month(column_view const& column, - rmm::mr::device_memory_resource* mr) +std::unique_ptr days_in_month(column_view const& column, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::days_in_month(column, cudf::get_default_stream(), mr); } std::unique_ptr extract_quarter(column_view const& column, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::extract_quarter(column, cudf::get_default_stream(), mr); diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index a75eea7172f..a3471485293 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -19,6 +19,8 @@ #include #include +#include + #include #include #include @@ -379,7 +381,7 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year) std::unique_ptr
make_timezone_transition_table(std::optional tzif_dir, std::string_view timezone_name, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::make_timezone_transition_table( @@ -391,7 +393,7 @@ namespace detail { std::unique_ptr
make_timezone_transition_table(std::optional tzif_dir, std::string_view timezone_name, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (timezone_name == "UTC" || timezone_name.empty()) { // Return an empty table for UTC diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index 3973100aced..0ed9006f88b 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,8 +29,11 @@ #include #include #include +#include +#include #include +#include namespace cudf { namespace dictionary { @@ -49,11 +52,12 @@ namespace detail { std::unique_ptr add_keys(dictionary_column_view const& dictionary_column, column_view const& new_keys, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(!new_keys.has_nulls(), "Keys must not have nulls"); auto old_keys = dictionary_column.keys(); // [a,b,c,d,f] - CUDF_EXPECTS(new_keys.type() == old_keys.type(), "Keys must be the same type"); + CUDF_EXPECTS( + cudf::have_same_types(new_keys, old_keys), "Keys must be the same type", cudf::data_type_error); // first, concatenate the keys together // [a,b,c,d,f] + [d,b,e] = [a,b,c,d,f,d,b,e] auto combined_keys = cudf::detail::concatenate( @@ -131,7 +135,7 @@ std::unique_ptr add_keys(dictionary_column_view const& dictionary_column std::unique_ptr add_keys(dictionary_column_view const& dictionary_column, column_view const& keys, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::add_keys(dictionary_column, keys, stream, mr); diff --git a/cpp/src/dictionary/decode.cu 
b/cpp/src/dictionary/decode.cu index 8ce741c4a91..9f05593fc40 100644 --- a/cpp/src/dictionary/decode.cu +++ b/cpp/src/dictionary/decode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ #include #include +#include namespace cudf { namespace dictionary { @@ -46,7 +47,7 @@ struct indices_handler_fn { */ std::unique_ptr decode(dictionary_column_view const& source, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (source.is_empty()) return make_empty_column(type_id::EMPTY); @@ -77,7 +78,7 @@ std::unique_ptr decode(dictionary_column_view const& source, std::unique_ptr decode(dictionary_column_view const& source, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::decode(source, stream, mr); diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index 17295fb0345..fdc3d9d0ecf 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -26,11 +26,14 @@ #include #include #include +#include +#include #include #include #include #include +#include #include #include @@ -81,13 +84,13 @@ struct compute_children_offsets_fn { } /** - * @brief Return the first keys().type of the dictionary columns. + * @brief Return the first keys() of the dictionary columns. 
*/ - data_type get_keys_type() + column_view get_keys() { auto const view(*std::find_if( columns_ptrs.begin(), columns_ptrs.end(), [](auto pcv) { return pcv->size() > 0; })); - return dictionary_column_view(*view).keys().type(); + return dictionary_column_view(*view).keys(); } /** @@ -140,7 +143,7 @@ struct dispatch_compute_indices { offsets_pair const* d_offsets, size_type const* d_map_to_keys, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto keys_view = column_device_view::create(all_keys, stream); auto indices_view = column_device_view::create(all_indices, stream); @@ -206,21 +209,23 @@ struct dispatch_compute_indices { std::unique_ptr concatenate(host_span columns, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // exception here is the same behavior as in cudf::concatenate CUDF_EXPECTS(not columns.empty(), "Unexpected empty list of columns to concatenate."); // concatenate the keys (and check the keys match) compute_children_offsets_fn child_offsets_fn{columns}; - auto keys_type = child_offsets_fn.get_keys_type(); + auto expected_keys = child_offsets_fn.get_keys(); std::vector keys_views(columns.size()); - std::transform(columns.begin(), columns.end(), keys_views.begin(), [keys_type](auto cv) { + std::transform(columns.begin(), columns.end(), keys_views.begin(), [expected_keys](auto cv) { auto dict_view = dictionary_column_view(cv); // empty column may not have keys so we create an empty column_view place-holder - if (dict_view.is_empty()) return column_view{keys_type, 0, nullptr, nullptr, 0}; + if (dict_view.is_empty()) return column_view{expected_keys.type(), 0, nullptr, nullptr, 0}; auto keys = dict_view.keys(); - CUDF_EXPECTS(keys.type() == keys_type, "key types of all dictionary columns must match"); + CUDF_EXPECTS(cudf::have_same_types(keys, expected_keys), + "key types of all dictionary columns must match", + 
cudf::data_type_error); return keys; }); auto all_keys = @@ -274,7 +279,7 @@ std::unique_ptr concatenate(host_span columns, // now recompute the indices values for the new keys_column; // the keys offsets (pair.first) are for mapping to the input keys - auto indices_column = type_dispatcher(keys_type, + auto indices_column = type_dispatcher(expected_keys.type(), dispatch_compute_indices{}, all_keys->view(), // old keys all_indices->view(), // old indices diff --git a/cpp/src/dictionary/detail/merge.cu b/cpp/src/dictionary/detail/merge.cu index 2fe21680873..c65aa5d1101 100644 --- a/cpp/src/dictionary/detail/merge.cu +++ b/cpp/src/dictionary/detail/merge.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ #include #include +#include #include @@ -36,7 +37,7 @@ std::unique_ptr merge(dictionary_column_view const& lcol, dictionary_column_view const& rcol, cudf::detail::index_vector const& row_order, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const lcol_iter = cudf::detail::indexalator_factory::make_input_iterator(lcol.indices()); auto const rcol_iter = cudf::detail::indexalator_factory::make_input_iterator(rcol.indices()); diff --git a/cpp/src/dictionary/dictionary_factories.cu b/cpp/src/dictionary/dictionary_factories.cu index f70423a13a9..37f8fa7a05b 100644 --- a/cpp/src/dictionary/dictionary_factories.cu +++ b/cpp/src/dictionary/dictionary_factories.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,6 +23,7 @@ #include #include +#include namespace cudf { namespace { @@ -30,7 +31,7 @@ struct dispatch_create_indices { template ()>* = nullptr> std::unique_ptr operator()(column_view const& indices, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(std::is_unsigned(), "indices must be an unsigned type"); column_view indices_view{ @@ -40,7 +41,7 @@ struct dispatch_create_indices { template ()>* = nullptr> std::unique_ptr operator()(column_view const&, rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) + rmm::device_async_resource_ref) { CUDF_FAIL("indices must be an integer type."); } @@ -50,7 +51,7 @@ struct dispatch_create_indices { std::unique_ptr make_dictionary_column(column_view const& keys_column, column_view const& indices_column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(!keys_column.has_nulls(), "keys column must not have nulls"); if (keys_column.is_empty()) return make_empty_column(type_id::DICTIONARY32); @@ -117,7 +118,7 @@ struct make_unsigned_fn { std::unique_ptr make_dictionary_column(std::unique_ptr keys, std::unique_ptr indices, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(!keys->has_nulls(), "keys column must not have nulls"); diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu index c92b57f0cac..ff29d83b80a 100644 --- a/cpp/src/dictionary/encode.cu +++ b/cpp/src/dictionary/encode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,6 +29,7 @@ #include #include +#include namespace cudf { namespace dictionary { @@ -41,7 +42,7 @@ namespace detail { std::unique_ptr encode(column_view const& input_column, data_type indices_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(is_unsigned(indices_type), "indices must be type unsigned integer"); CUDF_EXPECTS(input_column.type().id() != type_id::DICTIONARY32, @@ -90,7 +91,7 @@ data_type get_indices_type_for_size(size_type keys_size) std::unique_ptr encode(column_view const& input_column, data_type indices_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::encode(input_column, indices_type, stream, mr); diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index 86b70f1119b..35387efa56b 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -26,9 +26,12 @@ #include #include #include +#include +#include #include #include +#include #include #include @@ -59,7 +62,7 @@ template std::unique_ptr remove_keys_fn(dictionary_column_view const& dictionary_column, KeysKeeper keys_to_keep_fn, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const keys_view = dictionary_column.keys(); auto const indices_type = dictionary_column.indices().type(); @@ -150,11 +153,13 @@ std::unique_ptr remove_keys_fn(dictionary_column_view const& dictionary_ std::unique_ptr remove_keys(dictionary_column_view const& dictionary_column, column_view const& keys_to_remove, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(!keys_to_remove.has_nulls(), "keys_to_remove must not have nulls"); auto const keys_view = dictionary_column.keys(); - CUDF_EXPECTS(keys_view.type() == keys_to_remove.type(), "keys types must match"); + CUDF_EXPECTS(cudf::have_same_types(keys_view, keys_to_remove), + "keys types must match", + cudf::data_type_error); // locate keys to remove by searching the keys column auto const matches = cudf::detail::contains(keys_to_remove, keys_view, stream, mr); @@ -166,7 +171,7 @@ std::unique_ptr remove_keys(dictionary_column_view const& dictionary_col std::unique_ptr remove_unused_keys(dictionary_column_view const& dictionary_column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // locate the keys to remove auto const keys_size = dictionary_column.keys_size(); @@ -196,7 +201,7 @@ std::unique_ptr remove_unused_keys(dictionary_column_view const& diction std::unique_ptr remove_keys(dictionary_column_view const& dictionary_column, column_view const& keys_to_remove, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::remove_keys(dictionary_column, 
keys_to_remove, stream, mr); @@ -204,7 +209,7 @@ std::unique_ptr remove_keys(dictionary_column_view const& dictionary_col std::unique_ptr remove_unused_keys(dictionary_column_view const& dictionary_column, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::remove_unused_keys(dictionary_column, stream, mr); diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu index 7069993866c..bc17dfd4bab 100644 --- a/cpp/src/dictionary/replace.cu +++ b/cpp/src/dictionary/replace.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,8 +24,11 @@ #include #include #include +#include +#include #include +#include namespace cudf { namespace dictionary { @@ -52,7 +55,7 @@ template std::unique_ptr replace_indices(column_view const& input, ReplacementIter replacement_iter, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const input_view = column_device_view::create(input, stream); auto const d_input = *input_view; @@ -74,16 +77,18 @@ std::unique_ptr replace_indices(column_view const& input, /** * @copydoc cudf::dictionary::detail::replace_nulls(cudf::column_view const&,cudf::column_view - * const& rmm::cuda_stream_view, rmm::mr::device_memory_resource*) + * const& rmm::cuda_stream_view, rmm::device_async_resource_ref) */ std::unique_ptr replace_nulls(dictionary_column_view const& input, dictionary_column_view const& replacement, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (input.is_empty()) { return cudf::empty_like(input.parent()); } if (!input.has_nulls()) { return std::make_unique(input.parent(), stream, mr); } - 
CUDF_EXPECTS(input.keys().type() == replacement.keys().type(), "keys must match"); + CUDF_EXPECTS(cudf::have_same_types(input.keys(), replacement.keys()), + "keys must match", + cudf::data_type_error); CUDF_EXPECTS(replacement.size() == input.size(), "column sizes must match"); // first combine the keys so both input dictionaries have the same set @@ -107,18 +112,20 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, /** * @copydoc cudf::dictionary::detail::replace_nulls(cudf::column_view const&,cudf::scalar - * const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) + * const&, rmm::cuda_stream_view, rmm::device_async_resource_ref) */ std::unique_ptr replace_nulls(dictionary_column_view const& input, scalar const& replacement, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (input.is_empty()) { return cudf::empty_like(input.parent()); } if (!input.has_nulls() || !replacement.is_valid(stream)) { return std::make_unique(input.parent(), stream, mr); } - CUDF_EXPECTS(input.keys().type() == replacement.type(), "keys must match scalar type"); + CUDF_EXPECTS(cudf::have_same_types(input.parent(), replacement), + "keys must match scalar type", + cudf::data_type_error); // first add the replacement to the keys so only the indices need to be processed auto input_matched = dictionary::detail::add_keys( diff --git a/cpp/src/dictionary/search.cu b/cpp/src/dictionary/search.cu index e35aded1984..231619836f9 100644 --- a/cpp/src/dictionary/search.cu +++ b/cpp/src/dictionary/search.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,11 +19,14 @@ #include #include #include +#include #include +#include #include #include #include +#include #include #include @@ -40,7 +43,7 @@ struct dispatch_scalar_index { std::unique_ptr operator()(size_type index, bool is_valid, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return std::make_unique>(index, is_valid, stream, mr); } @@ -69,12 +72,14 @@ struct find_index_fn { std::unique_ptr operator()(dictionary_column_view const& input, scalar const& key, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::device_async_resource_ref mr) const { - if (!key.is_valid(stream)) + if (!key.is_valid(stream)) { return type_dispatcher(input.indices().type(), dispatch_scalar_index{}, 0, false, stream, mr); - CUDF_EXPECTS(input.keys().type() == key.type(), - "search key type must match dictionary keys type"); + } + CUDF_EXPECTS(cudf::have_same_types(input.parent(), key), + "search key type must match dictionary keys type", + cudf::data_type_error); using ScalarType = cudf::scalar_type_t; auto find_key = static_cast(key).value(stream); @@ -96,7 +101,7 @@ struct find_index_fn { std::unique_ptr operator()(dictionary_column_view const&, scalar const&, rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) const + rmm::device_async_resource_ref) const { CUDF_FAIL( "dictionary, list_view, and struct_view columns cannot be the keys column of a dictionary"); @@ -111,12 +116,14 @@ struct find_insert_index_fn { std::unique_ptr operator()(dictionary_column_view const& input, scalar const& key, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::device_async_resource_ref mr) const { - if (!key.is_valid(stream)) + if (!key.is_valid(stream)) { return type_dispatcher(input.indices().type(), dispatch_scalar_index{}, 0, false, stream, mr); - CUDF_EXPECTS(input.keys().type() == key.type(), - "search key type must match dictionary keys type"); + } + 
CUDF_EXPECTS(cudf::have_same_types(input.parent(), key), + "search key type must match dictionary keys type", + cudf::data_type_error); using ScalarType = cudf::scalar_type_t; auto find_key = static_cast(key).value(stream); @@ -138,7 +145,7 @@ struct find_insert_index_fn { std::unique_ptr operator()(dictionary_column_view const&, scalar const&, rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) const + rmm::device_async_resource_ref) const { CUDF_FAIL("dictionary, list_view, and struct_view columns cannot be the keys for a dictionary"); } @@ -149,7 +156,7 @@ struct find_insert_index_fn { std::unique_ptr get_index(dictionary_column_view const& dictionary, scalar const& key, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (dictionary.is_empty()) return std::make_unique>(0, false, stream, mr); @@ -160,7 +167,7 @@ std::unique_ptr get_index(dictionary_column_view const& dictionary, std::unique_ptr get_insert_index(dictionary_column_view const& dictionary, scalar const& key, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (dictionary.is_empty()) return std::make_unique>(0, false, stream, mr); @@ -175,7 +182,7 @@ std::unique_ptr get_insert_index(dictionary_column_view const& dictionar std::unique_ptr get_index(dictionary_column_view const& dictionary, scalar const& key, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::get_index(dictionary, key, stream, mr); diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index b49cf7850b1..08a33d40abe 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,9 +29,12 @@ #include #include #include +#include +#include #include #include +#include #include #include @@ -61,7 +64,7 @@ struct dispatch_compute_indices { operator()(dictionary_column_view const& input, column_view const& new_keys, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto dictionary_view = column_device_view::create(input.parent(), stream); auto dictionary_itr = make_dictionary_iterator(*dictionary_view); @@ -115,15 +118,15 @@ struct dispatch_compute_indices { } // namespace -// std::unique_ptr set_keys(dictionary_column_view const& dictionary_column, column_view const& new_keys, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(!new_keys.has_nulls(), "keys parameter must not have nulls"); auto keys = dictionary_column.keys(); - CUDF_EXPECTS(keys.type() == new_keys.type(), "keys types must match"); + CUDF_EXPECTS( + cudf::have_same_types(keys, new_keys), "keys types must match", cudf::data_type_error); // copy the keys -- use cudf::distinct to make sure there are no duplicates, // then sort the results. 
@@ -177,7 +180,7 @@ std::unique_ptr set_keys(dictionary_column_view const& dictionary_column std::vector> match_dictionaries( cudf::host_span input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { std::vector keys(input.size()); std::transform(input.begin(), input.end(), keys.begin(), [](auto& col) { return col.keys(); }); @@ -191,7 +194,7 @@ std::vector> match_dictionaries( } std::pair>, std::vector> match_dictionaries( - std::vector tables, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) + std::vector tables, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { // Make a copy of all the column views from each table_view std::vector> updated_columns; @@ -242,7 +245,7 @@ std::pair>, std::vector> match_d std::unique_ptr set_keys(dictionary_column_view const& dictionary_column, column_view const& keys, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::set_keys(dictionary_column, keys, stream, mr); @@ -251,7 +254,7 @@ std::unique_ptr set_keys(dictionary_column_view const& dictionary_column std::vector> match_dictionaries( cudf::host_span input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::match_dictionaries(input, stream, mr); diff --git a/cpp/src/filling/calendrical_month_sequence.cu b/cpp/src/filling/calendrical_month_sequence.cu index 80badb7d566..3e6d693dde5 100644 --- a/cpp/src/filling/calendrical_month_sequence.cu +++ b/cpp/src/filling/calendrical_month_sequence.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,6 +23,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -30,7 +31,7 @@ std::unique_ptr calendrical_month_sequence(size_type size, scalar const& init, size_type months, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return type_dispatcher( init.type(), calendrical_month_sequence_functor{}, size, init, months, stream, mr); @@ -41,7 +42,7 @@ std::unique_ptr calendrical_month_sequence(size_type size, scalar const& init, size_type months, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::calendrical_month_sequence(size, init, months, stream, mr); diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index 42d1f7592ec..1fc9ed31c09 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -33,9 +33,11 @@ #include #include #include +#include #include #include +#include #include @@ -107,9 +109,9 @@ struct out_of_place_fill_range_dispatch { std::unique_ptr operator()(cudf::size_type begin, cudf::size_type end, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(input.type() == value.type(), "Data type mismatch."); + CUDF_EXPECTS(cudf::have_same_types(input, value), "Data type mismatch.", cudf::data_type_error); auto p_ret = std::make_unique(input, stream, mr); if (end != begin) { // otherwise no fill @@ -134,9 +136,9 @@ std::unique_ptr out_of_place_fill_range_dispatch::operator(); auto p_scalar = static_cast(&value); return cudf::strings::detail::fill( @@ -148,11 +150,12 @@ std::unique_ptr out_of_place_fill_range_dispatch::operator()(input, stream, mr); cudf::dictionary_column_view const target(input); - CUDF_EXPECTS(target.keys().type() == value.type(), "Data type mismatch."); + CUDF_EXPECTS( + cudf::have_same_types(target.parent(), value), "Data type mismatch.", cudf::data_type_error); // if the scalar is invalid, then just copy the column and fill the null mask if (!value.is_valid(stream)) { @@ -218,7 +221,8 @@ void fill_in_place(mutable_column_view& destination, "Range is out of bounds."); CUDF_EXPECTS(destination.nullable() || value.is_valid(stream), "destination should be nullable or value should be non-null."); - CUDF_EXPECTS(destination.type() == value.type(), "Data type mismatch."); + CUDF_EXPECTS( + cudf::have_same_types(destination, value), "Data type mismatch.", cudf::data_type_error); if (end != begin) { // otherwise no-op cudf::type_dispatcher( @@ -233,7 +237,7 @@ std::unique_ptr fill(column_view const& input, size_type end, scalar const& value, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS((begin >= 0) && (end <= input.size()) && (begin <= end), "Range is out of bounds."); @@ -258,7 +262,7 
@@ std::unique_ptr fill(column_view const& input, size_type end, scalar const& value, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::fill(input, begin, end, value, stream, mr); diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu index 87cc0f21d0e..ff4005d9366 100644 --- a/cpp/src/filling/repeat.cu +++ b/cpp/src/filling/repeat.cu @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -102,7 +103,7 @@ namespace detail { std::unique_ptr
repeat(table_view const& input_table, column_view const& count, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(input_table.num_rows() == count.size(), "in and count must have equal size"); CUDF_EXPECTS(not count.has_nulls(), "count cannot contain nulls"); @@ -131,7 +132,7 @@ std::unique_ptr
repeat(table_view const& input_table, std::unique_ptr
repeat(table_view const& input_table, size_type count, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if ((input_table.num_rows() == 0) || (count == 0)) { return cudf::empty_like(input_table); } @@ -154,7 +155,7 @@ std::unique_ptr
repeat(table_view const& input_table, std::unique_ptr
repeat(table_view const& input_table, column_view const& count, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::repeat(input_table, count, stream, mr); @@ -163,7 +164,7 @@ std::unique_ptr
repeat(table_view const& input_table, std::unique_ptr
repeat(table_view const& input_table, size_type count, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::repeat(input_table, count, stream, mr); diff --git a/cpp/src/filling/sequence.cu b/cpp/src/filling/sequence.cu index 99a17f8b0e0..ee1745b8498 100644 --- a/cpp/src/filling/sequence.cu +++ b/cpp/src/filling/sequence.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,9 +24,11 @@ #include #include #include +#include #include #include +#include #include #include @@ -66,7 +68,7 @@ struct sequence_functor { scalar const& init, scalar const& step, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto result = make_fixed_width_column(init.type(), size, mask_state::UNALLOCATED, stream, mr); auto result_device_view = mutable_column_device_view::create(*result, stream); @@ -92,7 +94,7 @@ struct sequence_functor { std::unique_ptr operator()(size_type size, scalar const& init, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto result = make_fixed_width_column(init.type(), size, mask_state::UNALLOCATED, stream, mr); auto result_device_view = mutable_column_device_view::create(*result, stream); @@ -125,9 +127,11 @@ std::unique_ptr sequence(size_type size, scalar const& init, scalar const& step, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(init.type() == step.type(), "init and step must be of the same type."); + CUDF_EXPECTS(cudf::have_same_types(init, step), + "init and step must be of the same type.", + cudf::data_type_error); CUDF_EXPECTS(size >= 0, "size must be >= 0"); 
CUDF_EXPECTS(is_numeric(init.type()), "Input scalar types must be numeric"); @@ -137,7 +141,7 @@ std::unique_ptr sequence(size_type size, std::unique_ptr sequence(size_type size, scalar const& init, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(size >= 0, "size must be >= 0"); CUDF_EXPECTS(is_numeric(init.type()), "init scalar type must be numeric"); @@ -151,7 +155,7 @@ std::unique_ptr sequence(size_type size, scalar const& init, scalar const& step, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::sequence(size, init, step, stream, mr); @@ -160,7 +164,7 @@ std::unique_ptr sequence(size_type size, std::unique_ptr sequence(size_type size, scalar const& init, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::sequence(size, init, stream, mr); diff --git a/cpp/src/groupby/common/utils.hpp b/cpp/src/groupby/common/utils.hpp index 09b85c74f08..82c3c08b501 100644 --- a/cpp/src/groupby/common/utils.hpp +++ b/cpp/src/groupby/common/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,8 @@ #include #include +#include + #include #include @@ -31,7 +33,7 @@ template inline std::vector extract_results(host_span requests, cudf::detail::result_cache& cache, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { std::vector results(requests.size()); std::unordered_map>, diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index e3c021eb66a..e43dfcb4d98 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,9 +36,11 @@ #include #include #include +#include #include #include +#include #include @@ -65,7 +67,7 @@ groupby::groupby(table_view const& keys, std::pair, std::vector> groupby::dispatch_aggregation( host_span requests, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // If sort groupby has been called once on this groupby object, then // always use sort groupby from now on. 
Because once keys are sorted, @@ -193,7 +195,7 @@ void verify_valid_requests(host_span requests) // Compute aggregation requests std::pair, std::vector> groupby::aggregate( - host_span requests, rmm::mr::device_memory_resource* mr) + host_span requests, rmm::device_async_resource_ref mr) { return aggregate(requests, cudf::get_default_stream(), mr); } @@ -202,7 +204,7 @@ std::pair, std::vector> groupby::aggr std::pair, std::vector> groupby::aggregate( host_span requests, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS( @@ -220,7 +222,7 @@ std::pair, std::vector> groupby::aggr // Compute scan requests std::pair, std::vector> groupby::scan( - host_span requests, rmm::mr::device_memory_resource* mr) + host_span requests, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS( @@ -236,7 +238,7 @@ std::pair, std::vector> groupby::scan return sort_scan(requests, cudf::get_default_stream(), mr); } -groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_resource* mr) +groupby::groups groupby::get_groups(table_view values, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); auto const stream = cudf::get_default_stream(); @@ -262,7 +264,7 @@ groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_re std::pair, std::unique_ptr
> groupby::replace_nulls( table_view const& values, host_span replace_policies, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(_keys.num_rows() == values.num_rows(), @@ -306,17 +308,20 @@ std::pair, std::unique_ptr
> groupby::shift( table_view const& values, host_span offsets, std::vector> const& fill_values, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(values.num_columns() == static_cast(fill_values.size()), "Mismatch number of fill_values and columns."); - CUDF_EXPECTS( - std::all_of(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(values.num_columns()), - [&](auto i) { return values.column(i).type() == fill_values[i].get().type(); }), - "values and fill_value should have the same type."); - + CUDF_EXPECTS(std::equal(values.begin(), + values.end(), + fill_values.cbegin(), + fill_values.cend(), + [](auto const& col, auto const& scalar) { + return cudf::have_same_types(col, scalar.get()); + }), + "values and fill_value should have the same type.", + cudf::data_type_error); auto stream = cudf::get_default_stream(); std::vector> results; auto const& group_offsets = helper().group_offsets(stream); diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index acc1b087510..4f75ab19c66 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -44,6 +44,7 @@ #include #include +#include #include #include @@ -190,7 +191,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final SetType set; bitmask_type const* __restrict__ row_bitmask; rmm::cuda_stream_view stream; - rmm::mr::device_memory_resource* mr; + rmm::device_async_resource_ref mr; public: using cudf::detail::aggregation_finalizer::visit; @@ -202,7 +203,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final SetType set, bitmask_type const* row_bitmask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) : col(col), sparse_results(sparse_results), dense_results(dense_results), @@ -398,7 +399,7 @@ void sparse_to_dense_results(table_view const& keys, bool keys_have_nulls, null_policy 
include_null_keys, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto row_bitmask = cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first; @@ -551,7 +552,7 @@ std::unique_ptr
groupby(table_view const& keys, bool const keys_have_nulls, null_policy const include_null_keys, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const num_keys = keys.num_rows(); auto const null_keys_are_equal = null_equality::EQUAL; @@ -654,7 +655,7 @@ std::pair, std::vector> groupby( host_span requests, null_policy include_null_keys, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { cudf::detail::result_cache cache(requests.size()); diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 2d6f99de25a..ba59616babe 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -37,6 +37,7 @@ #include #include +#include #include #include @@ -797,7 +798,7 @@ void aggregate_result_functor::operator()(aggregatio std::pair, std::vector> groupby::sort_aggregate( host_span requests, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // We're going to start by creating a cache of results so that aggs that // depend on other aggs will not have to be recalculated. e.g. mean depends on diff --git a/cpp/src/groupby/sort/functors.hpp b/cpp/src/groupby/sort/functors.hpp index be36956b929..057085fe85d 100644 --- a/cpp/src/groupby/sort/functors.hpp +++ b/cpp/src/groupby/sort/functors.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include #include +#include #include @@ -42,7 +43,7 @@ struct store_result_functor { sort::sort_groupby_helper& helper, cudf::detail::result_cache& cache, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr, + rmm::device_async_resource_ref mr, sorted keys_are_sorted = sorted::NO) : helper(helper), cache(cache), @@ -98,8 +99,8 @@ struct store_result_functor { cudf::detail::result_cache& cache; ///< cache of results to store into column_view const& values; ///< Column of values to group and aggregate - rmm::cuda_stream_view stream; ///< CUDA stream on which to execute kernels - rmm::mr::device_memory_resource* mr; ///< Memory resource to allocate space for results + rmm::cuda_stream_view stream; ///< CUDA stream on which to execute kernels + rmm::device_async_resource_ref mr; ///< Memory resource to allocate space for results sorted keys_are_sorted; ///< Whether the keys are sorted std::unique_ptr sorted_values; ///< Memoised grouped and sorted values diff --git a/cpp/src/groupby/sort/group_argmax.cu b/cpp/src/groupby/sort/group_argmax.cu index a9c098bcf61..a1d197b1307 100644 --- a/cpp/src/groupby/sort/group_argmax.cu +++ b/cpp/src/groupby/sort/group_argmax.cu @@ -20,6 +20,7 @@ #include #include +#include #include @@ -31,7 +32,7 @@ std::unique_ptr group_argmax(column_view const& values, cudf::device_span group_labels, column_view const& key_sort_order, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto indices = type_dispatcher(values.type(), group_reduction_dispatcher{}, diff --git a/cpp/src/groupby/sort/group_argmin.cu b/cpp/src/groupby/sort/group_argmin.cu index 53a514ac8a7..03243bef836 100644 --- a/cpp/src/groupby/sort/group_argmin.cu +++ b/cpp/src/groupby/sort/group_argmin.cu @@ -20,6 +20,7 @@ #include #include +#include #include @@ -31,7 +32,7 @@ std::unique_ptr group_argmin(column_view const& values, cudf::device_span group_labels, column_view const& 
key_sort_order, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto indices = type_dispatcher(values.type(), group_reduction_dispatcher{}, diff --git a/cpp/src/groupby/sort/group_collect.cu b/cpp/src/groupby/sort/group_collect.cu index f95ad72f453..555c5d3ad41 100644 --- a/cpp/src/groupby/sort/group_collect.cu +++ b/cpp/src/groupby/sort/group_collect.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include #include +#include #include #include @@ -50,7 +51,7 @@ std::pair, std::unique_ptr> purge_null_entries( column_view const& offsets, size_type num_groups, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto values_device_view = column_device_view::create(values, stream); @@ -91,7 +92,7 @@ std::unique_ptr group_collect(column_view const& values, size_type num_groups, null_policy null_handling, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto [child_column, offsets_column] = [null_handling, num_groups, &values, &group_offsets, stream, mr] { diff --git a/cpp/src/groupby/sort/group_correlation.cu b/cpp/src/groupby/sort/group_correlation.cu index 4389b833c33..152aa98a8b9 100644 --- a/cpp/src/groupby/sort/group_correlation.cu +++ b/cpp/src/groupby/sort/group_correlation.cu @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -120,7 +121,7 @@ std::unique_ptr group_covariance(column_view const& values_0, size_type min_periods, size_type ddof, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { using result_type = id_to_type; static_assert( @@ -181,7 +182,7 @@ std::unique_ptr 
group_correlation(column_view const& covariance, column_view const& stddev_0, column_view const& stddev_1, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { using result_type = id_to_type; CUDF_EXPECTS(covariance.type().id() == type_id::FLOAT64, "Covariance result must be FLOAT64"); diff --git a/cpp/src/groupby/sort/group_count.cu b/cpp/src/groupby/sort/group_count.cu index 2f289c8c8a7..56a4943e272 100644 --- a/cpp/src/groupby/sort/group_count.cu +++ b/cpp/src/groupby/sort/group_count.cu @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -37,7 +38,7 @@ std::unique_ptr group_count_valid(column_view const& values, cudf::device_span group_labels, size_type num_groups, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(num_groups >= 0, "number of groups cannot be negative"); CUDF_EXPECTS(static_cast(values.size()) == group_labels.size(), @@ -80,7 +81,7 @@ std::unique_ptr group_count_valid(column_view const& values, std::unique_ptr group_count_all(cudf::device_span group_offsets, size_type num_groups, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(num_groups >= 0, "number of groups cannot be negative"); diff --git a/cpp/src/groupby/sort/group_count_scan.cu b/cpp/src/groupby/sort/group_count_scan.cu index 2e8fd41d984..c076f21e1f8 100644 --- a/cpp/src/groupby/sort/group_count_scan.cu +++ b/cpp/src/groupby/sort/group_count_scan.cu @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -30,7 +31,7 @@ namespace groupby { namespace detail { std::unique_ptr count_scan(cudf::device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { std::unique_ptr result = make_fixed_width_column( data_type{type_id::INT32}, group_labels.size(), mask_state::UNALLOCATED, stream, mr); diff --git 
a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu index 67c30adcd47..1000ec0d470 100644 --- a/cpp/src/groupby/sort/group_histogram.cu +++ b/cpp/src/groupby/sort/group_histogram.cu @@ -26,6 +26,7 @@ #include #include +#include #include @@ -38,7 +39,7 @@ std::unique_ptr build_histogram(column_view const& values, std::optional const& partial_counts, size_type num_groups, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(static_cast(values.size()) == group_labels.size(), "Size of values column should be the same as that of group labels.", @@ -89,7 +90,7 @@ std::unique_ptr group_histogram(column_view const& values, cudf::device_span group_labels, size_type num_groups, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // Empty group should be handled before reaching here. CUDF_EXPECTS(num_groups > 0, "Group should not be empty.", std::invalid_argument); @@ -101,7 +102,7 @@ std::unique_ptr group_merge_histogram(column_view const& values, cudf::device_span group_offsets, size_type num_groups, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // Empty group should be handled before reaching here. CUDF_EXPECTS(num_groups > 0, "Group should not be empty.", std::invalid_argument); diff --git a/cpp/src/groupby/sort/group_m2.cu b/cpp/src/groupby/sort/group_m2.cu index 70b05100fb0..77f33486284 100644 --- a/cpp/src/groupby/sort/group_m2.cu +++ b/cpp/src/groupby/sort/group_m2.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -88,7 +89,7 @@ struct m2_functor { column_view const& group_means, cudf::device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { using result_type = cudf::detail::target_type_t; auto result = make_numeric_column(data_type(type_to_id()), @@ -133,7 +134,7 @@ std::unique_ptr group_m2(column_view const& values, column_view const& group_means, cudf::device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto values_type = cudf::is_dictionary(values.type()) ? dictionary_column_view(values).keys().type() diff --git a/cpp/src/groupby/sort/group_max.cu b/cpp/src/groupby/sort/group_max.cu index 148188f5fdf..60b071c25ff 100644 --- a/cpp/src/groupby/sort/group_max.cu +++ b/cpp/src/groupby/sort/group_max.cu @@ -17,6 +17,7 @@ #include "groupby/sort/group_single_pass_reduction_util.cuh" #include +#include namespace cudf { namespace groupby { @@ -25,7 +26,7 @@ std::unique_ptr group_max(column_view const& values, size_type num_groups, cudf::device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto values_type = cudf::is_dictionary(values.type()) ? 
dictionary_column_view(values).keys().type() diff --git a/cpp/src/groupby/sort/group_max_scan.cu b/cpp/src/groupby/sort/group_max_scan.cu index 8679ab09df6..270059cfcad 100644 --- a/cpp/src/groupby/sort/group_max_scan.cu +++ b/cpp/src/groupby/sort/group_max_scan.cu @@ -17,6 +17,7 @@ #include "groupby/sort/group_scan_util.cuh" #include +#include namespace cudf { namespace groupby { @@ -25,7 +26,7 @@ std::unique_ptr max_scan(column_view const& values, size_type num_groups, cudf::device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return type_dispatcher(values.type(), group_scan_dispatcher{}, diff --git a/cpp/src/groupby/sort/group_merge_lists.cu b/cpp/src/groupby/sort/group_merge_lists.cu index 2c72128dbfb..92cce1aa00e 100644 --- a/cpp/src/groupby/sort/group_merge_lists.cu +++ b/cpp/src/groupby/sort/group_merge_lists.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include #include @@ -30,7 +31,7 @@ std::unique_ptr group_merge_lists(column_view const& values, cudf::device_span group_offsets, size_type num_groups, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(values.type().id() == type_id::LIST, "Input to `group_merge_lists` must be a lists column."); diff --git a/cpp/src/groupby/sort/group_merge_m2.cu b/cpp/src/groupby/sort/group_merge_m2.cu index a580c9dac9d..4ad8fa5ff07 100644 --- a/cpp/src/groupby/sort/group_merge_m2.cu +++ b/cpp/src/groupby/sort/group_merge_m2.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ #include #include +#include #include #include @@ -131,7 +132,7 @@ std::unique_ptr group_merge_m2(column_view const& values, cudf::device_span group_offsets, size_type num_groups, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(values.type().id() == type_id::STRUCT, "Input to `group_merge_m2` must be a structs column."); diff --git a/cpp/src/groupby/sort/group_min.cu b/cpp/src/groupby/sort/group_min.cu index 3939fc41b65..22aaf664168 100644 --- a/cpp/src/groupby/sort/group_min.cu +++ b/cpp/src/groupby/sort/group_min.cu @@ -17,6 +17,7 @@ #include "groupby/sort/group_single_pass_reduction_util.cuh" #include +#include namespace cudf { namespace groupby { @@ -25,7 +26,7 @@ std::unique_ptr group_min(column_view const& values, size_type num_groups, cudf::device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto values_type = cudf::is_dictionary(values.type()) ? 
dictionary_column_view(values).keys().type() diff --git a/cpp/src/groupby/sort/group_min_scan.cu b/cpp/src/groupby/sort/group_min_scan.cu index 7d2a88fb038..4ddc10a2e5a 100644 --- a/cpp/src/groupby/sort/group_min_scan.cu +++ b/cpp/src/groupby/sort/group_min_scan.cu @@ -17,6 +17,7 @@ #include "groupby/sort/group_scan_util.cuh" #include +#include namespace cudf { namespace groupby { @@ -25,7 +26,7 @@ std::unique_ptr min_scan(column_view const& values, size_type num_groups, cudf::device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return type_dispatcher(values.type(), group_scan_dispatcher{}, diff --git a/cpp/src/groupby/sort/group_nth_element.cu b/cpp/src/groupby/sort/group_nth_element.cu index 694c052e42d..1bc1eef908c 100644 --- a/cpp/src/groupby/sort/group_nth_element.cu +++ b/cpp/src/groupby/sort/group_nth_element.cu @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -49,7 +50,7 @@ std::unique_ptr group_nth_element(column_view const& values, size_type n, null_policy null_handling, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(static_cast(values.size()) == group_labels.size(), "Size of values column should be same as that of group labels"); diff --git a/cpp/src/groupby/sort/group_nunique.cu b/cpp/src/groupby/sort/group_nunique.cu index 1a5f1691d5b..de11e70719a 100644 --- a/cpp/src/groupby/sort/group_nunique.cu +++ b/cpp/src/groupby/sort/group_nunique.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -78,7 +79,7 @@ std::unique_ptr group_nunique(column_view const& values, cudf::device_span group_offsets, null_policy null_handling, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(num_groups >= 0, "number of groups cannot be negative"); CUDF_EXPECTS(static_cast(values.size()) == group_labels.size(), diff --git a/cpp/src/groupby/sort/group_product.cu b/cpp/src/groupby/sort/group_product.cu index c53362f2095..83ca1059325 100644 --- a/cpp/src/groupby/sort/group_product.cu +++ b/cpp/src/groupby/sort/group_product.cu @@ -20,6 +20,7 @@ #include #include +#include namespace cudf { namespace groupby { @@ -28,7 +29,7 @@ std::unique_ptr group_product(column_view const& values, size_type num_groups, cudf::device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto values_type = cudf::is_dictionary(values.type()) ? 
dictionary_column_view(values).keys().type() diff --git a/cpp/src/groupby/sort/group_product_scan.cu b/cpp/src/groupby/sort/group_product_scan.cu index e1a615730dd..40c53ceeff1 100644 --- a/cpp/src/groupby/sort/group_product_scan.cu +++ b/cpp/src/groupby/sort/group_product_scan.cu @@ -17,6 +17,7 @@ #include "groupby/sort/group_scan_util.cuh" #include +#include namespace cudf { namespace groupby { @@ -25,7 +26,7 @@ std::unique_ptr product_scan(column_view const& values, size_type num_groups, cudf::device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return type_dispatcher(values.type(), group_scan_dispatcher{}, diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu index a6bc2d5b38d..3156dfaadd0 100644 --- a/cpp/src/groupby/sort/group_quantiles.cu +++ b/cpp/src/groupby/sort/group_quantiles.cu @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -90,7 +91,7 @@ struct quantiles_functor { device_span quantile, interpolation interpolation, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { using ResultType = cudf::detail::target_type_t; @@ -161,7 +162,7 @@ std::unique_ptr group_quantiles(column_view const& values, std::vector const& quantiles, interpolation interp, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto dv_quantiles = cudf::detail::make_device_uvector_async( quantiles, stream, rmm::mr::get_current_device_resource()); diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu index 5cf7844410e..0b65889f127 100644 --- a/cpp/src/groupby/sort/group_rank_scan.cu +++ b/cpp/src/groupby/sort/group_rank_scan.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -100,7 +101,7 @@ std::unique_ptr rank_generator(column_view const& grouped_values, scan_operator scan_op, bool has_nulls, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const grouped_values_view = table_view{{grouped_values}}; auto const comparator = @@ -155,7 +156,7 @@ std::unique_ptr min_rank_scan(column_view const& grouped_values, device_span group_labels, device_span group_offsets, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return rank_generator( grouped_values, @@ -176,7 +177,7 @@ std::unique_ptr max_rank_scan(column_view const& grouped_values, device_span group_labels, device_span group_offsets, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return rank_generator( grouped_values, @@ -197,7 +198,7 @@ std::unique_ptr first_rank_scan(column_view const& grouped_values, device_span group_labels, device_span group_offsets, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto ranks = make_fixed_width_column( data_type{type_to_id()}, group_labels.size(), mask_state::UNALLOCATED, stream, mr); @@ -218,7 +219,7 @@ std::unique_ptr average_rank_scan(column_view const& grouped_values, device_span group_labels, device_span group_offsets, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto max_rank = max_rank_scan(grouped_values, value_order, @@ -251,7 +252,7 @@ std::unique_ptr dense_rank_scan(column_view const& grouped_values, device_span group_labels, device_span group_offsets, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + 
rmm::device_async_resource_ref mr) { return rank_generator( grouped_values, @@ -272,7 +273,7 @@ std::unique_ptr group_rank_to_percentage(rank_method const method, device_span group_labels, device_span group_offsets, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(percentage != rank_percentage::NONE, "Percentage cannot be NONE"); auto ranks = make_fixed_width_column( diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index 3aa79f226a3..5e76dc3135a 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include #include @@ -52,7 +53,7 @@ std::unique_ptr group_sum(column_view const& values, size_type num_groups, cudf::device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to calculate groupwise product @@ -75,7 +76,7 @@ std::unique_ptr group_product(column_view const& values, size_type num_groups, cudf::device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to calculate groupwise minimum value @@ -98,7 +99,7 @@ std::unique_ptr group_min(column_view const& values, size_type num_groups, cudf::device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to calculate groupwise maximum value @@ -121,7 +122,7 @@ std::unique_ptr group_max(column_view const& values, size_type num_groups, cudf::device_span group_labels, 
rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to calculate group-wise indices of maximum values. @@ -146,7 +147,7 @@ std::unique_ptr group_argmax(column_view const& values, cudf::device_span group_labels, column_view const& key_sort_order, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to calculate group-wise indices of minimum values. @@ -171,7 +172,7 @@ std::unique_ptr group_argmin(column_view const& values, cudf::device_span group_labels, column_view const& key_sort_order, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to calculate number of non-null values in each group of @@ -195,7 +196,7 @@ std::unique_ptr group_count_valid(column_view const& values, cudf::device_span group_labels, size_type num_groups, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to calculate number of values in each group of @p values @@ -215,7 +216,7 @@ std::unique_ptr group_count_valid(column_view const& values, std::unique_ptr group_count_all(cudf::device_span group_offsets, size_type num_groups, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to compute histogram for each group in @p values. * @@ -242,7 +243,7 @@ std::unique_ptr group_histogram(column_view const& values, cudf::device_span group_labels, size_type num_groups, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to calculate sum of squares of differences from means. 
@@ -266,7 +267,7 @@ std::unique_ptr group_m2(column_view const& values, column_view const& group_means, cudf::device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to calculate groupwise variance @@ -296,7 +297,7 @@ std::unique_ptr group_var(column_view const& values, cudf::device_span group_labels, size_type ddof, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to calculate groupwise quantiles @@ -326,7 +327,7 @@ std::unique_ptr group_quantiles(column_view const& values, std::vector const& quantiles, interpolation interp, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to calculate number of unique values in each group of @@ -358,7 +359,7 @@ std::unique_ptr group_nunique(column_view const& values, cudf::device_span group_offsets, null_policy null_handling, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to calculate nth values in each group of @p values @@ -393,7 +394,7 @@ std::unique_ptr group_nth_element(column_view const& values, size_type n, null_policy null_handling, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to collect grouped values into a lists column * @@ -418,7 +419,7 @@ std::unique_ptr group_collect(column_view const& values, size_type num_groups, null_policy null_handling, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to merge grouped lists into one list. 
@@ -441,7 +442,7 @@ std::unique_ptr group_merge_lists(column_view const& values, cudf::device_span group_offsets, size_type num_groups, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to merge grouped M2 values corresponding to the same key. @@ -467,7 +468,7 @@ std::unique_ptr group_merge_m2(column_view const& values, cudf::device_span group_offsets, size_type num_groups, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to merge multiple output of HISTOGRAM aggregation. @@ -494,7 +495,7 @@ std::unique_ptr group_merge_histogram(column_view const& values, cudf::device_span group_offsets, size_type num_groups, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to find covariance of child columns of a non-nullable struct column. @@ -521,7 +522,7 @@ std::unique_ptr group_covariance(column_view const& values_0, size_type min_periods, size_type ddof, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to find correlation from covariance and standard deviation. @@ -536,7 +537,7 @@ std::unique_ptr group_correlation(column_view const& covariance, column_view const& stddev_0, column_view const& stddev_1, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace groupby diff --git a/cpp/src/groupby/sort/group_replace_nulls.cu b/cpp/src/groupby/sort/group_replace_nulls.cu index 49557164230..566507da230 100644 --- a/cpp/src/groupby/sort/group_replace_nulls.cu +++ b/cpp/src/groupby/sort/group_replace_nulls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -40,7 +41,7 @@ std::unique_ptr group_replace_nulls(cudf::column_view const& grouped_val device_span group_labels, cudf::replace_policy replace_policy, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { cudf::size_type size = grouped_value.size(); diff --git a/cpp/src/groupby/sort/group_scan.hpp b/cpp/src/groupby/sort/group_scan.hpp index fd53046f7e2..6f2daae5f9d 100644 --- a/cpp/src/groupby/sort/group_scan.hpp +++ b/cpp/src/groupby/sort/group_scan.hpp @@ -21,6 +21,7 @@ #include #include +#include #include @@ -40,7 +41,7 @@ std::unique_ptr sum_scan(column_view const& values, size_type num_groups, device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to calculate groupwise cumulative product @@ -57,7 +58,7 @@ std::unique_ptr product_scan(column_view const& values, size_type num_groups, device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to calculate groupwise cumulative minimum value @@ -72,7 +73,7 @@ std::unique_ptr min_scan(column_view const& values, size_type num_groups, device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to calculate groupwise cumulative maximum value @@ -87,7 +88,7 @@ std::unique_ptr max_scan(column_view const& values, size_type num_groups, device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to calculate cumulative number of values in each group @@ -99,7 +100,7 @@ 
std::unique_ptr max_scan(column_view const& values, */ std::unique_ptr count_scan(device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to calculate groupwise min rank value @@ -118,7 +119,7 @@ std::unique_ptr min_rank_scan(column_view const& grouped_values, device_span group_labels, device_span group_offsets, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to calculate groupwise max rank value @@ -128,14 +129,14 @@ std::unique_ptr min_rank_scan(column_view const& grouped_values, * device_span group_labels, * device_span group_offsets, * rmm::cuda_stream_view stream, - * rmm::mr::device_memory_resource* mr) + * rmm::device_async_resource_ref mr) */ std::unique_ptr max_rank_scan(column_view const& grouped_values, column_view const& value_order, device_span group_labels, device_span group_offsets, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to calculate groupwise first rank value @@ -145,14 +146,14 @@ std::unique_ptr max_rank_scan(column_view const& grouped_values, * device_span group_labels, * device_span group_offsets, * rmm::cuda_stream_view stream, - * rmm::mr::device_memory_resource* mr) + * rmm::device_async_resource_ref mr) */ std::unique_ptr first_rank_scan(column_view const& grouped_values, column_view const& value_order, device_span group_labels, device_span group_offsets, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to calculate groupwise average rank value @@ -162,14 +163,14 @@ std::unique_ptr first_rank_scan(column_view const& grouped_values, * device_span group_labels, * device_span group_offsets, * rmm::cuda_stream_view stream, - * rmm::mr::device_memory_resource* mr) + * 
rmm::device_async_resource_ref mr) */ std::unique_ptr average_rank_scan(column_view const& grouped_values, column_view const& value_order, device_span group_labels, device_span group_offsets, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Internal API to calculate groupwise dense rank value @@ -186,7 +187,7 @@ std::unique_ptr dense_rank_scan(column_view const& grouped_values, device_span group_labels, device_span group_offsets, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); /** * @brief Convert groupwise rank to groupwise percentage rank @@ -209,7 +210,7 @@ std::unique_ptr group_rank_to_percentage(rank_method const method, device_span group_labels, device_span group_offsets, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); } // namespace detail } // namespace groupby diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh index 2ebc8ba7d5d..b360ba2c45d 100644 --- a/cpp/src/groupby/sort/group_scan_util.cuh +++ b/cpp/src/groupby/sort/group_scan_util.cuh @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -60,7 +61,7 @@ struct group_scan_dispatcher { size_type num_groups, cudf::device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return group_scan_functor::invoke(values, num_groups, group_labels, stream, mr); } @@ -89,7 +90,7 @@ struct group_scan_functor() size_type num_groups, cudf::device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { using DeviceType = device_storage_type_t; using OpType = cudf::detail::corresponding_operator_t; @@ -145,7 +146,7 @@ struct group_scan_functor group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* 
mr) + rmm::device_async_resource_ref mr) { using OpType = cudf::detail::corresponding_operator_t; @@ -191,7 +192,7 @@ struct group_scan_functor group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (values.is_empty()) { return cudf::empty_like(values); } diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index 42d4b654346..5e892710d3b 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -116,7 +117,7 @@ struct group_reduction_dispatcher { size_type num_groups, cudf::device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return group_reduction_functor::invoke(values, num_groups, group_labels, stream, mr); } @@ -149,7 +150,7 @@ struct group_reduction_functor< size_type num_groups, cudf::device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { using SourceDType = device_storage_type_t; @@ -218,7 +219,7 @@ struct group_reduction_functor< size_type num_groups, cudf::device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // This is be expected to be size_type. using ResultType = cudf::detail::target_type_t; diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index 30b6f67dffe..70f64186f21 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -104,7 +105,7 @@ struct var_functor { cudf::device_span group_labels, size_type ddof, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { using ResultType = cudf::detail::target_type_t; @@ -175,7 +176,7 @@ std::unique_ptr group_var(column_view const& values, cudf::device_span group_labels, size_type ddof, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto values_type = cudf::is_dictionary(values.type()) ? dictionary_column_view(values).keys().type() diff --git a/cpp/src/groupby/sort/group_sum.cu b/cpp/src/groupby/sort/group_sum.cu index 0af7cb22159..316b6f395bb 100644 --- a/cpp/src/groupby/sort/group_sum.cu +++ b/cpp/src/groupby/sort/group_sum.cu @@ -20,6 +20,7 @@ #include #include +#include namespace cudf { namespace groupby { @@ -28,7 +29,7 @@ std::unique_ptr group_sum(column_view const& values, size_type num_groups, cudf::device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto values_type = cudf::is_dictionary(values.type()) ? 
dictionary_column_view(values).keys().type() diff --git a/cpp/src/groupby/sort/group_sum_scan.cu b/cpp/src/groupby/sort/group_sum_scan.cu index 2efa1185899..01c4d0c2c4a 100644 --- a/cpp/src/groupby/sort/group_sum_scan.cu +++ b/cpp/src/groupby/sort/group_sum_scan.cu @@ -17,6 +17,7 @@ #include "groupby/sort/group_scan_util.cuh" #include +#include namespace cudf { namespace groupby { @@ -25,7 +26,7 @@ std::unique_ptr sum_scan(column_view const& values, size_type num_groups, cudf::device_span group_labels, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return type_dispatcher(values.type(), group_scan_dispatcher{}, diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp index 45c232aa3aa..f211c61b3b7 100644 --- a/cpp/src/groupby/sort/scan.cpp +++ b/cpp/src/groupby/sort/scan.cpp @@ -35,6 +35,7 @@ #include #include +#include #include @@ -207,7 +208,7 @@ void scan_result_functor::operator()(aggregation const& agg) std::pair, std::vector> groupby::sort_scan( host_span requests, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // We're going to start by creating a cache of results so that aggs that // depend on other aggs will not have to be recalculated. e.g. 
mean depends on diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 1e6c7a9393f..4da1da089cd 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -35,6 +35,7 @@ #include #include +#include #include #include @@ -248,7 +249,7 @@ column_view sort_groupby_helper::keys_bitmask_column(rmm::cuda_stream_view strea } sort_groupby_helper::column_ptr sort_groupby_helper::sorted_values( - column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) + column_view const& values, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { column_ptr values_sort_order = cudf::detail::stable_sorted_order(table_view({unsorted_keys_labels(stream), values}), @@ -272,7 +273,7 @@ sort_groupby_helper::column_ptr sort_groupby_helper::sorted_values( } sort_groupby_helper::column_ptr sort_groupby_helper::grouped_values( - column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) + column_view const& values, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { auto gather_map = key_sort_order(stream); @@ -287,7 +288,7 @@ sort_groupby_helper::column_ptr sort_groupby_helper::grouped_values( } std::unique_ptr
sort_groupby_helper::unique_keys(rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto idx_data = key_sort_order(stream).data(); @@ -305,7 +306,7 @@ std::unique_ptr
sort_groupby_helper::unique_keys(rmm::cuda_stream_view st } std::unique_ptr
sort_groupby_helper::sorted_keys(rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return cudf::detail::gather(_keys, key_sort_order(stream), diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu deleted file mode 100644 index 68e02ef3cf4..00000000000 --- a/cpp/src/hash/hashing.cu +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include -#include - -#include - -namespace cudf { -namespace hashing { -namespace detail { - -std::unique_ptr hash(table_view const& input, - hash_id hash_function, - uint32_t seed, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - switch (hash_function) { - case (hash_id::HASH_MURMUR3): return murmurhash3_x86_32(input, seed, stream, mr); - case (hash_id::HASH_SPARK_MURMUR3): return spark_murmurhash3_x86_32(input, seed, stream, mr); - case (hash_id::HASH_MD5): return md5(input, stream, mr); - default: CUDF_FAIL("Unsupported hash function."); - } -} - -} // namespace detail -} // namespace hashing - -std::unique_ptr hash(table_view const& input, - hash_id hash_function, - uint32_t seed, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return hashing::detail::hash(input, hash_function, seed, stream, mr); -} - -} // namespace cudf diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu 
index b34455905d9..0b559e8e86c 100644 --- a/cpp/src/hash/md5_hash.cu +++ b/cpp/src/hash/md5_hash.cu @@ -29,6 +29,7 @@ #include #include +#include #include #include @@ -284,7 +285,7 @@ inline bool md5_leaf_type_check(data_type dt) std::unique_ptr md5(table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (input.num_columns() == 0 || input.num_rows() == 0) { // Return the MD5 hash of a zero-length input. @@ -308,7 +309,7 @@ std::unique_ptr md5(table_view const& input, // Result column allocation and creation auto begin = thrust::make_constant_iterator(digest_size); auto [offsets_column, bytes] = - cudf::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr); + cudf::strings::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr); rmm::device_uvector chars(bytes, stream, mr); auto d_chars = chars.data(); @@ -321,7 +322,7 @@ std::unique_ptr md5(table_view const& input, thrust::make_counting_iterator(0), thrust::make_counting_iterator(input.num_rows()), [d_chars, device_input = *device_input] __device__(auto row_index) { - MD5Hasher hasher(d_chars + (row_index * digest_size)); + MD5Hasher hasher(d_chars + (static_cast(row_index) * digest_size)); for (auto const& col : device_input) { if (col.is_valid(row_index)) { if (col.type().id() == type_id::LIST) { @@ -349,7 +350,7 @@ std::unique_ptr md5(table_view const& input, std::unique_ptr md5(table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::md5(input, stream, mr); diff --git a/cpp/src/hash/murmurhash3_x64_128.cu b/cpp/src/hash/murmurhash3_x64_128.cu index 1fc469686e1..6c91532a193 100644 --- a/cpp/src/hash/murmurhash3_x64_128.cu +++ b/cpp/src/hash/murmurhash3_x64_128.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include +#include #include @@ -109,7 +110,7 @@ class murmur_device_row_hasher { std::unique_ptr
murmurhash3_x64_128(table_view const& input, uint64_t seed, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto output1 = make_numeric_column( data_type(type_id::UINT64), input.num_rows(), mask_state::UNALLOCATED, stream, mr); @@ -140,7 +141,7 @@ std::unique_ptr
murmurhash3_x64_128(table_view const& input, std::unique_ptr
murmurhash3_x64_128(table_view const& input, uint64_t seed, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::murmurhash3_x64_128(input, seed, stream, mr); diff --git a/cpp/src/hash/murmurhash3_x86_32.cu b/cpp/src/hash/murmurhash3_x86_32.cu index a6ab301a86e..eac72f5d995 100644 --- a/cpp/src/hash/murmurhash3_x86_32.cu +++ b/cpp/src/hash/murmurhash3_x86_32.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include #include +#include #include @@ -33,7 +34,7 @@ namespace detail { std::unique_ptr murmurhash3_x86_32(table_view const& input, uint32_t seed, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto output = make_numeric_column(data_type(type_to_id()), input.num_rows(), @@ -62,7 +63,7 @@ std::unique_ptr murmurhash3_x86_32(table_view const& input, std::unique_ptr murmurhash3_x86_32(table_view const& input, uint32_t seed, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::murmurhash3_x86_32(input, seed, stream, mr); diff --git a/cpp/src/hash/sha1_hash.cu b/cpp/src/hash/sha1_hash.cu index 71253d279b9..f7609eb26af 100644 --- a/cpp/src/hash/sha1_hash.cu +++ b/cpp/src/hash/sha1_hash.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -62,7 +63,7 @@ struct SHA1Hash : HashBase { std::unique_ptr sha1(table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return sha_hash(input, stream, mr); } @@ -71,7 +72,7 @@ std::unique_ptr sha1(table_view const& input, std::unique_ptr sha1(table_view const& input, 
rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::sha1(input, stream, mr); diff --git a/cpp/src/hash/sha224_hash.cu b/cpp/src/hash/sha224_hash.cu index 61480a78776..cf04504a489 100644 --- a/cpp/src/hash/sha224_hash.cu +++ b/cpp/src/hash/sha224_hash.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -63,7 +64,7 @@ struct SHA224Hash : HashBase { std::unique_ptr sha224(table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return sha_hash(input, stream, mr); } @@ -72,7 +73,7 @@ std::unique_ptr sha224(table_view const& input, std::unique_ptr sha224(table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::sha224(input, stream, mr); diff --git a/cpp/src/hash/sha256_hash.cu b/cpp/src/hash/sha256_hash.cu index b15cfe09d52..664913c0f4c 100644 --- a/cpp/src/hash/sha256_hash.cu +++ b/cpp/src/hash/sha256_hash.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -63,7 +64,7 @@ struct SHA256Hash : HashBase { std::unique_ptr sha256(table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return sha_hash(input, stream, mr); } @@ -72,7 +73,7 @@ std::unique_ptr sha256(table_view const& input, std::unique_ptr sha256(table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::sha256(input, stream, mr); diff --git a/cpp/src/hash/sha384_hash.cu b/cpp/src/hash/sha384_hash.cu index 3075d2c62f8..92192f501ec 100644 --- a/cpp/src/hash/sha384_hash.cu +++ b/cpp/src/hash/sha384_hash.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -70,7 +71,7 @@ struct SHA384Hash : HashBase { 
std::unique_ptr sha384(table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return sha_hash(input, stream, mr); } @@ -79,7 +80,7 @@ std::unique_ptr sha384(table_view const& input, std::unique_ptr sha384(table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::sha384(input, stream, mr); diff --git a/cpp/src/hash/sha512_hash.cu b/cpp/src/hash/sha512_hash.cu index d073cf1edca..244206aeeb9 100644 --- a/cpp/src/hash/sha512_hash.cu +++ b/cpp/src/hash/sha512_hash.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -70,7 +71,7 @@ struct SHA512Hash : HashBase { std::unique_ptr sha512(table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return sha_hash(input, stream, mr); } @@ -79,7 +80,7 @@ std::unique_ptr sha512(table_view const& input, std::unique_ptr sha512(table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::sha512(input, stream, mr); diff --git a/cpp/src/hash/sha_hash.cuh b/cpp/src/hash/sha_hash.cuh index 0a22ee34918..6976241057e 100644 --- a/cpp/src/hash/sha_hash.cuh +++ b/cpp/src/hash/sha_hash.cuh @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -503,7 +504,7 @@ bool inline sha_leaf_type_check(data_type dt) template std::unique_ptr sha_hash(table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (input.num_rows() == 0) { return cudf::make_empty_column(cudf::type_id::STRING); } @@ -517,7 +518,7 @@ std::unique_ptr sha_hash(table_view const& input, // Result column allocation and creation auto begin = thrust::make_constant_iterator(Hasher::digest_size); auto [offsets_column, 
bytes] = - cudf::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr); + cudf::strings::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr); auto chars = rmm::device_uvector(bytes, stream, mr); auto d_chars = chars.data(); @@ -525,19 +526,20 @@ std::unique_ptr sha_hash(table_view const& input, auto const device_input = table_device_view::create(input, stream); // Hash each row, hashing each element sequentially left to right - thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.num_rows()), - [d_chars, device_input = *device_input] __device__(auto row_index) { - Hasher hasher(d_chars + (row_index * Hasher::digest_size)); - for (auto const& col : device_input) { - if (col.is_valid(row_index)) { - cudf::type_dispatcher( - col.type(), HasherDispatcher(&hasher, col), row_index); - } - } - hasher.finalize(); - }); + thrust::for_each( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.num_rows()), + [d_chars, device_input = *device_input] __device__(auto row_index) { + Hasher hasher(d_chars + (static_cast(row_index) * Hasher::digest_size)); + for (auto const& col : device_input) { + if (col.is_valid(row_index)) { + cudf::type_dispatcher( + col.type(), HasherDispatcher(&hasher, col), row_index); + } + } + hasher.finalize(); + }); return make_strings_column(input.num_rows(), std::move(offsets_column), chars.release(), 0, {}); } diff --git a/cpp/src/hash/spark_murmurhash3_x86_32.cu b/cpp/src/hash/spark_murmurhash3_x86_32.cu deleted file mode 100644 index c7992b4afa0..00000000000 --- a/cpp/src/hash/spark_murmurhash3_x86_32.cu +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -namespace cudf { -namespace hashing { -namespace detail { - -namespace { - -using spark_hash_value_type = int32_t; - -template ())> -struct Spark_MurmurHash3_x86_32 { - using result_type = spark_hash_value_type; - - constexpr Spark_MurmurHash3_x86_32() = default; - constexpr Spark_MurmurHash3_x86_32(uint32_t seed) : m_seed(seed) {} - - [[nodiscard]] __device__ inline uint32_t fmix32(uint32_t h) const - { - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - return h; - } - - [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data, - cudf::size_type offset) const - { - // Read a 4-byte value from the data pointer as individual bytes for safe - // unaligned access (very likely for string types). - auto block = reinterpret_cast(data + offset); - return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); - } - - [[nodiscard]] result_type __device__ inline operator()(Key const& key) const - { - return compute(key); - } - - template - result_type __device__ inline compute(T const& key) const - { - return compute_bytes(reinterpret_cast(&key), sizeof(T)); - } - - result_type __device__ inline compute_remaining_bytes(std::byte const* data, - cudf::size_type len, - cudf::size_type tail_offset, - result_type h) const - { - // Process remaining bytes that do not fill a four-byte chunk using Spark's approach - // (does not conform to normal MurmurHash3). 
- for (auto i = tail_offset; i < len; i++) { - // We require a two-step cast to get the k1 value from the byte. First, - // we must cast to a signed int8_t. Then, the sign bit is preserved when - // casting to uint32_t under 2's complement. Java preserves the sign when - // casting byte-to-int, but C++ does not. - uint32_t k1 = static_cast(std::to_integer(data[i])); - k1 *= c1; - k1 = rotate_bits_left(k1, rot_c1); - k1 *= c2; - h ^= k1; - h = rotate_bits_left(static_cast(h), rot_c2); - h = h * 5 + c3; - } - return h; - } - - result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const - { - constexpr cudf::size_type BLOCK_SIZE = 4; - cudf::size_type const nblocks = len / BLOCK_SIZE; - cudf::size_type const tail_offset = nblocks * BLOCK_SIZE; - result_type h = m_seed; - - // Process all four-byte chunks. - for (cudf::size_type i = 0; i < nblocks; i++) { - uint32_t k1 = getblock32(data, i * BLOCK_SIZE); - k1 *= c1; - k1 = rotate_bits_left(k1, rot_c1); - k1 *= c2; - h ^= k1; - h = rotate_bits_left(static_cast(h), rot_c2); - h = h * 5 + c3; - } - - h = compute_remaining_bytes(data, len, tail_offset, h); - - // Finalize hash. 
- h ^= len; - h = fmix32(h); - return h; - } - - private: - uint32_t m_seed{cudf::DEFAULT_HASH_SEED}; - static constexpr uint32_t c1 = 0xcc9e2d51; - static constexpr uint32_t c2 = 0x1b873593; - static constexpr uint32_t c3 = 0xe6546b64; - static constexpr uint32_t rot_c1 = 15; - static constexpr uint32_t rot_c2 = 13; -}; - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - bool const& key) const -{ - return compute(key); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - int8_t const& key) const -{ - return compute(key); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - uint8_t const& key) const -{ - return compute(key); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - int16_t const& key) const -{ - return compute(key); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - uint16_t const& key) const -{ - return compute(key); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - float const& key) const -{ - return compute(normalize_nans(key)); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - double const& key) const -{ - return compute(normalize_nans(key)); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - cudf::string_view const& key) const -{ - auto const data = reinterpret_cast(key.data()); - auto const len = key.size_bytes(); - return compute_bytes(data, len); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - numeric::decimal32 const& key) const -{ - return compute(key.value()); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - numeric::decimal64 const& key) const -{ - return 
compute(key.value()); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - numeric::decimal128 const& key) const -{ - // Generates the Spark MurmurHash3 hash value, mimicking the conversion: - // java.math.BigDecimal.valueOf(unscaled_value, _scale).unscaledValue().toByteArray() - // https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala#L381 - __int128_t const val = key.value(); - constexpr cudf::size_type key_size = sizeof(__int128_t); - std::byte const* data = reinterpret_cast(&val); - - // Small negative values start with 0xff..., small positive values start with 0x00... - bool const is_negative = val < 0; - std::byte const zero_value = is_negative ? std::byte{0xff} : std::byte{0x00}; - - // If the value can be represented with a shorter than 16-byte integer, the - // leading bytes of the little-endian value are truncated and are not hashed. - auto const reverse_begin = thrust::reverse_iterator(data + key_size); - auto const reverse_end = thrust::reverse_iterator(data); - auto const first_nonzero_byte = - thrust::find_if_not(thrust::seq, reverse_begin, reverse_end, [zero_value](std::byte const& v) { - return v == zero_value; - }).base(); - // Max handles special case of 0 and -1 which would shorten to 0 length otherwise - cudf::size_type length = - std::max(1, static_cast(thrust::distance(data, first_nonzero_byte))); - - // Preserve the 2's complement sign bit by adding a byte back on if necessary. - // e.g. 0x0000ff would shorten to 0x00ff. The 0x00 byte is retained to - // preserve the sign bit, rather than leaving an "f" at the front which would - // change the sign bit. However, 0x00007f would shorten to 0x7f. No extra byte - // is needed because the leftmost bit matches the sign bit. Similarly for - // negative values: 0xffff00 --> 0xff00 and 0xffff80 --> 0x80. 
- if ((length < key_size) && (is_negative ^ bool(data[length - 1] & std::byte{0x80}))) { ++length; } - - // Convert to big endian by reversing the range of nonzero bytes. Only those bytes are hashed. - __int128_t big_endian_value = 0; - auto big_endian_data = reinterpret_cast(&big_endian_value); - thrust::reverse_copy(thrust::seq, data, data + length, big_endian_data); - return compute_bytes(big_endian_data, length); -} - -/** - * @brief Computes the hash value of a row in the given table. - * - * This functor uses Spark conventions for Murmur hashing, which differs from - * the Murmur implementation used in the rest of libcudf. These differences - * include: - * - Serially using the output hash as an input seed for the next item - * - Ignorance of null values - * - * The serial use of hashes as seeds means that data of different nested types - * can exhibit hash collisions. For example, a row of an integer column - * containing a 1 will have the same hash as a lists column of integers - * containing a list of [1] and a struct column of a single integer column - * containing a struct of {1}. - * - * As a consequence of ignoring null values, inputs like [1], [1, null], and - * [null, 1] have the same hash (an expected hash collision). This kind of - * collision can also occur across a table of nullable columns and with nulls - * in structs ({1, null} and {null, 1} have the same hash). The seed value (the - * previous element's hash value) is returned as the hash if an element is - * null. - * - * For additional differences such as special tail processing and decimal type - * handling, refer to the Spark_MurmurHash3_x86_32 functor. - * - * @tparam hash_function Hash functor to use for hashing elements. Must be Spark_MurmurHash3_x86_32. - * @tparam Nullate A cudf::nullate type describing whether to check for nulls. - */ -template